Setup

In [1]:
# supress the display of warning messages
import warnings
warnings.filterwarnings('ignore')
In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
import sklearn as sk

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import export_text

from sklearn.model_selection import train_test_split, cross_validate,\
GridSearchCV, cross_val_score, KFold, ParameterGrid

from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support,\
accuracy_score, recall_score, precision_score, f1_score,\
confusion_matrix, classification_report

from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier,\
BaggingClassifier, AdaBoostClassifier
In [4]:
# install imbalanced-learn package
!pip install -U imbalanced-learn
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.7/dist-packages (0.8.1)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
     |████████████████████████████████| 199 kB 5.3 MB/s 
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (1.7.3)
Requirement already satisfied: joblib>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (1.2.0)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (1.21.6)
  Downloading imbalanced_learn-0.9.0-py3-none-any.whl (199 kB)
     |████████████████████████████████| 199 kB 40.8 MB/s 
Requirement already satisfied: scikit-learn>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (1.0.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (3.1.0)
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.8.1
    Uninstalling imbalanced-learn-0.8.1:
      Successfully uninstalled imbalanced-learn-0.8.1
Successfully installed imbalanced-learn-0.9.0
In [5]:
# import samplers and classifiers from imblearn
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

# from collections import Counter
# from sklearn.datasets import make_classification
# from sklearn.svm import LinearSVC

from imblearn.pipeline import make_pipeline 
from imblearn.over_sampling import (RandomOverSampler, ADASYN, 
                                    SMOTE, BorderlineSMOTE, SVMSMOTE)
from imblearn.under_sampling import (RandomUnderSampler,
                                     ClusterCentroids,
                                     NearMiss,
                                     InstanceHardnessThreshold,
                                     CondensedNearestNeighbour,
                                     EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours,
                                     AllKNN,
                                     NeighbourhoodCleaningRule,
                                     OneSidedSelection)
from imblearn.combine import (SMOTEENN, SMOTETomek)
from imblearn.ensemble import (BalancedBaggingClassifier,
                               BalancedRandomForestClassifier,
                               EasyEnsembleClassifier,
                               RUSBoostClassifier)
# from imblearn.base import BaseSampler
In [6]:
# Mount the drive - Must be done each time session expires.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [7]:
# set seed so that results are reproducible 
np.random.seed(123456)
In [8]:
# import the dataset
df = pd.read_csv('/content/drive/My Drive/carAuction.csv')
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Auction                        10000 non-null  object
 1   Color                          10000 non-null  object
 2   IsBadBuy                       10000 non-null  object
 3   MMRCurrentAuctionAveragePrice  10000 non-null  int64 
 4   Size                           10000 non-null  object
 5   TopThreeAmericanName           10000 non-null  object
 6   VehBCost                       10000 non-null  int64 
 7   VehicleAge                     10000 non-null  int64 
 8   VehOdo                         10000 non-null  int64 
 9   WarrantyCost                   10000 non-null  int64 
 10  WheelType                      10000 non-null  object
dtypes: int64(5), object(6)
memory usage: 859.5+ KB
In [10]:
# null value counts for each column
print('Null value counts:', "\n", df.isnull().sum())
Null value counts: 
 Auction                          0
Color                            0
IsBadBuy                         0
MMRCurrentAuctionAveragePrice    0
Size                             0
TopThreeAmericanName             0
VehBCost                         0
VehicleAge                       0
VehOdo                           0
WarrantyCost                     0
WheelType                        0
dtype: int64
In [11]:
# distribution of the target variable
y_counts = df['IsBadBuy'].value_counts()
y_counts
Out[11]:
No     8705
Yes    1295
Name: IsBadBuy, dtype: int64
In [12]:
# save the indices from value_counts for use in the print statement below
y_levels = y_counts.index
y_levels
Out[12]:
Index(['No', 'Yes'], dtype='object')

Create two datasets with increased imbalance for experimentation

In [13]:
# All the "yes" instances in the dataset
yes_df = df[df.IsBadBuy.str.match('Yes')]
yes_df.shape
Out[13]:
(1295, 11)
In [14]:
# Randomly sample from the yeses -- yes_df1 will have 25% of the yeses, yes_df2 will have 75% of the yeses
yes_df1, yes_df2 = train_test_split(yes_df, train_size = 0.25)
yes_df1.shape
Out[14]:
(323, 11)
In [15]:
df_imb1 = df[~df.index.isin(yes_df1.index)].copy() # subtract 25% of the yeses
df_imb2 = df[~df.index.isin(yes_df2.index)].copy() # subtract 75% of the yeses
In [16]:
df_imb1.shape
Out[16]:
(9677, 11)
In [17]:
df_imb2.shape
Out[17]:
(9028, 11)
In [18]:
# distribution of the target variable in the two new datasets
y_levels = y_counts.index
y_levels

imb1_y_counts = df_imb1['IsBadBuy'].value_counts()
imb2_y_counts = df_imb2['IsBadBuy'].value_counts()

print("In the original dataset:", "\n")
for i in y_levels:
  print(f"{round(y_counts[i]/len(df)*100, 2)}% is {i}", "\n")

print("\n")

print("In df_imb1:", "\n")
for i in y_levels:
  print(f"{round(imb1_y_counts[i]/len(df)*100, 2)}% is {i}", "\n")

print("\n")

print("In df_imb2:", "\n")
for i in y_levels:
  print(f"{round(imb2_y_counts[i]/len(df)*100, 2)}% is {i}", "\n")
In the original dataset: 

87.05% is No 

12.95% is Yes 



In df_imb1: 

87.05% is No 

9.72% is Yes 



In df_imb2: 

87.05% is No 

3.23% is Yes 

Create sampler object lists

In [19]:
# oversamplers from imblearn.over_sample
osampler_list = [RandomOverSampler(random_state=42),
                 ADASYN(random_state=42),
                 SMOTE(random_state=42),
                 SVMSMOTE(random_state=42),
                 BorderlineSMOTE(random_state=42)]
osampler_name_list = ['ROS','ADASYN','SMOTE','SVMSMOTE','BorderlineSMOTE']
In [20]:
# undersamplers from imblearn.under_sample
usampler_list = [RandomUnderSampler(random_state=42),
                 NearMiss(version=1),
                 NearMiss(version=2),
                 NearMiss(version=3),
#                 TomekLinks(),
                 EditedNearestNeighbours(),
                 RepeatedEditedNearestNeighbours(),
                 AllKNN()]
usampler_name_list = ['RUS','NearMiss1','NearMiss2','NearMiss3',
                      'ENN','RENN','ALLKNN']
In [21]:
# combined samplers from imblearn.combine
csampler_list = [SMOTEENN(random_state=42),
                 SMOTETomek(random_state=42)]
csampler_name_list = ['SMOTEENN', 'SMOTETomek']

Define classifier objects and create lists of their names

In [22]:
# Balanced ensemble classifiers from imblearn.ensemble
bbc = BalancedBaggingClassifier(n_estimators=10, 
                                base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=42)
# brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
rusboost = RUSBoostClassifier(n_estimators=50, algorithm='SAMME.R',
                              random_state=42)
eec = EasyEnsembleClassifier(n_estimators=10, random_state=42)
be_clf_list = [bbc,rusboost,eec]
be_clf_name_list = ['Balanced_Bagging','RUSBoost','EasyEnsembleClassifier']
In [23]:
# five different classifiers to test out 

# set hyperparameters for the first iteration of the decision tree classifier
DTC1 = DecisionTreeClassifier(criterion='entropy',max_depth=4,random_state=42)

# then list the five classifiers
clf_list1 = [DTC1, GaussianNB(),MLPClassifier(),SVC(),KNeighborsClassifier()]
clf_name_list1 = ['DTC1','NB_default','MLP_default','SVC_default','KNN_default']
In [24]:
# logistic regression classifiers
lr_lbfgs = LogisticRegression(random_state=42)
lr_lbfgs_lowerC = LogisticRegression(C=0.1, random_state=42)
lr_newton = LogisticRegression(solver='newton-cg',random_state=42)
lr_lib = LogisticRegression(solver='liblinear',random_state=42) 
lr_lib_lowerC = LogisticRegression(C=0.1,solver='liblinear',random_state=42) 
lr_lib_l1 = LogisticRegression(solver='liblinear',penalty = 'l1',random_state=42) 
lr_sag = LogisticRegression(solver='sag',random_state=42)
lr_saga = LogisticRegression(solver='saga',random_state=42)

lr_list = [lr_lbfgs,lr_newton,lr_sag,lr_saga, lr_lib, lr_lib_lowerC,lr_lib_l1,]
lr_name_list = ['lbfgs_l2','newton_l2','sag_l2','saga_l2','lib_l2','lib_lowerC','lib_l1',]
In [25]:
# bagging classifiers
bag_list=[BaggingClassifier(random_state=42),
          BaggingClassifier(n_estimators=20,random_state=42),
          BaggingClassifier(base_estimator=SVC(),random_state=42),
          BaggingClassifier(base_estimator=LogisticRegression(),random_state=42)]
bag_name_list=['Bagging_default','Bagging_20','Bagging_SVC','Bagging_lr']
In [26]:
# boosting classifiers
boost_list = [AdaBoostClassifier(), AdaBoostClassifier(learning_rate=0.5), 
              AdaBoostClassifier(base_estimator = LogisticRegression(), n_estimators=15,random_state=42),
              AdaBoostClassifier(base_estimator = LogisticRegression(), n_estimators=15,learning_rate=0.5,random_state=42)]
boost_name_list = ['Ada_default','Ada_dt_halflearning','Ada_lr_15','Ada_lr_15_halflearning']
In [27]:
# random forest classifiers
rf_list = [RandomForestClassifier(random_state=42), 
           RandomForestClassifier(criterion='entropy',random_state=42), 
           RandomForestClassifier(n_estimators=50,random_state=42),
           RandomForestClassifier(criterion='entropy',n_estimators=50,random_state=42),
           RandomForestClassifier(max_depth=7,random_state=42),
           RandomForestClassifier(criterion='entropy',max_depth=7,random_state=42)]
rf_name_list = ['rf_default','rf_entropy_default','rf_50','rf_entropy_50','rf_max7','rf_entropy_max7']
In [28]:
# a combination of the classifiers above with a variety of different hyperparameters
lr_lib_l1_lowerc = LogisticRegression(C=0.1,solver='liblinear',penalty = 'l1', random_state=42)
Ada_lr_15_halflearn = AdaBoostClassifier(base_estimator = LogisticRegression(), n_estimators=15,learning_rate=0.5,random_state=42)
rf_max7 = RandomForestClassifier(max_depth=7,random_state=42)
Bagging_20 = BaggingClassifier(n_estimators=20,random_state=42)
clf_list4 = [DTC1,SVC(), 
            lr_lib_l1_lowerc,
            Bagging_20,
            Ada_lr_15_halflearn,
            rf_max7]
clf_name_list4 = ['DTC1','SVC_default','lr_lib_l1_lowerc','Bagging_20','Ada_lr_15_halflearn','rf_max7']

Encode categorical variables

In [29]:
def fun_enc(df, df_name):
  # separate target variable and features
  y_df = df['IsBadBuy'].copy()
  X_df = df.drop('IsBadBuy', axis=1).copy()

  # separate numeric and catogorical features
  X_num_df = X_df.select_dtypes(exclude=['object']).copy()
  X_cat_df = X_df.select_dtypes(include=['object']).copy()
  X_cat_list = X_cat_df.columns.tolist()
  X_col_list = X_cat_list + X_num_df.columns.tolist()

  enc_col_df_list =[] # list of encoded dataframes for use in pd.concat()
  enc_col_counts = [] # list of number of encoded features per original feature
  
  # encode the categorical features
  for i in X_cat_list:
    dummy_df = pd.get_dummies(X_cat_df[i]) # create an encoded dataframe for each categorical feature
    enc_col_counts.append(dummy_df.shape[1]) # add the number of columns from the dataframe the list of counts
    enc_col_df_list.append(dummy_df) # add the newly encoded dataframe to the list of dataframes 
  
  # add in the numeric features
  for i in X_num_df:
    enc_col_counts.append(1) # each numeric feature only needs one dummy variable
    enc_col_df_list.append(df[i]) # add the numeric columns to the final list of encoded columns
  X_enc_df = pd.concat(enc_col_df_list,axis=1) # concatenate the list of dataframes

  print(f'In {df_name}: the count of new columns created from each original column:\n',enc_col_counts)

  return X_enc_df, y_df, X_col_list, enc_col_counts
In [30]:
X_df, y_df, X_col_list, remove_col_counts = fun_enc(df, 'df')
In df: the count of new columns created from each original column:
 [3, 16, 12, 4, 4, 1, 1, 1, 1, 1]
In [31]:
X_df1, y_df1, X_col_list1, remove_col_counts1 = fun_enc(df_imb1, 'df_imb1')
X_df2, y_df2, X_col_list2, remove_col_counts2 = fun_enc(df_imb2, 'df_imb2')
In df_imb1: the count of new columns created from each original column:
 [3, 16, 12, 4, 4, 1, 1, 1, 1, 1]
In df_imb2: the count of new columns created from each original column:
 [3, 16, 12, 4, 4, 1, 1, 1, 1, 1]
In [32]:
def fun_split(df_name, X_df, y_df, train_pct, val_pct): 
    # split the predictors and the target data frame into test (1- train_pct) and 
    # train (train_pct) dataframes using the target data frame
    X_train, X_val_test, y_train, y_val_test = \
    train_test_split(X_df, y_df, train_size=train_pct, random_state=42)
    
    # split again using train_size=val_pct
    X_val, X_test, y_val, y_test = \
    train_test_split(X_val_test, y_val_test, train_size=val_pct, random_state=42)
    
    # print the shapes of the three divisions
    print(f'In {df_name}:')
    print('\n')
    print(f'Shape of X_train:', X_train.shape)
    print(f'Shape of y_train:', y_train.shape)
    print('\n')
    print(f'Shape of X_val:', X_val.shape)
    print(f'Shape of y_val:', y_val.shape)
    print('\n')
    print(f'Shape of X_test', X_test.shape)
    print(f'Shape of y_test', y_test.shape)
    print('\n')

    # Generate y_counts in train, val and test sets 
    train_y_counts = y_train.value_counts()
    val_y_counts = y_val.value_counts()
    test_y_counts = y_test.value_counts()
    
    # print y distribution in percentage for train, val and test sets
    print(f'In {df_name} train set:','\n')
    for i in y_levels:
      print(f'{round(100*train_y_counts[i]/X_train.shape[0],2)} percent is {i}','\n')
    print('\n')
    print(f'In {df_name} validation set:','\n')
    for i in y_levels:
      print(f'{round(100*val_y_counts[i]/X_val.shape[0],2)} percent is {i}','\n')
    print('\n')
    print(f'In {df_name} test set:','\n')
    for i in y_levels:
      print(f'{round(100*test_y_counts[i]/X_test.shape[0],2)} percent is {i}','\n')
    return X_train, y_train, X_val, y_val, X_test, y_test; 
    print('\n')
In [33]:
# split the original df
X_train,  y_train, X_val, y_val, X_test, y_test = fun_split('df', X_df, y_df, 0.6, 0.5)
In df:


Shape of X_train: (6000, 44)
Shape of y_train: (6000,)


Shape of X_val: (2000, 44)
Shape of y_val: (2000,)


Shape of X_test (2000, 44)
Shape of y_test (2000,)


In df train set: 

86.88 percent is No 

13.12 percent is Yes 



In df validation set: 

87.4 percent is No 

12.6 percent is Yes 



In df test set: 

87.2 percent is No 

12.8 percent is Yes 

In [34]:
# split df_imb1
X_train1,  y_train1, X_val1, y_val1, X_test1, y_test1 = fun_split('df_imb1', X_df1, y_df1, 0.6, 0.5)
In df_imb1:


Shape of X_train: (5806, 44)
Shape of y_train: (5806,)


Shape of X_val: (1935, 44)
Shape of y_val: (1935,)


Shape of X_test (1936, 44)
Shape of y_test (1936,)


In df_imb1 train set: 

89.8 percent is No 

10.2 percent is Yes 



In df_imb1 validation set: 

90.49 percent is No 

9.51 percent is Yes 



In df_imb1 test set: 

89.88 percent is No 

10.12 percent is Yes 

In [35]:
# split df_imb2
X_train2,  y_train2, X_val2, y_val2, X_test2, y_test2 = fun_split('df_imb2', X_df2, y_df2, 0.6, 0.5)
In df_imb2:


Shape of X_train: (5416, 44)
Shape of y_train: (5416,)


Shape of X_val: (1806, 44)
Shape of y_val: (1806,)


Shape of X_test (1806, 44)
Shape of y_test (1806,)


In df_imb2 train set: 

96.58 percent is No 

3.42 percent is Yes 



In df_imb2 validation set: 

95.85 percent is No 

4.15 percent is Yes 



In df_imb2 test set: 

96.51 percent is No 

3.49 percent is Yes 

Create resampled train sets using SMOTE

Don't resample the validation or test sets. Always evaluate on actual, non-resampled data.

In [36]:
# resample the train set from the original data (don't resample validate or test sets)
sm = SMOTE(random_state=42)
smk3 = SMOTE(k_neighbors=3, random_state=42)
In [37]:
X_train_sm, y_train_sm = sm.fit_resample(X_train.values, y_train)
In [38]:
print('Shapes of X_train_sm and y_train_sm:',X_train_sm.shape, y_train_sm.shape,'\n')
print('Target variable instance counts:\n', pd.Series(y_train_sm).value_counts())
Shapes of X_train_sm and y_train_sm: (10426, 44) (10426,) 

Target variable instance counts:
 Yes    5213
No     5213
Name: IsBadBuy, dtype: int64
In [39]:
# resample the train sets of the two new datasets that are more imbalanced
X_train1_sm, y_train1_sm =sm.fit_resample(X_train1.values, y_train1)
X_train2_sm, y_train2_sm =sm.fit_resample(X_train2.values, y_train2)
print('Shapes of X_train1_sm and y_train1_sm:',X_train1_sm.shape, y_train1_sm.shape,'\n')
print('Instance counts by values in target variable:\n', pd.Series(y_train1_sm).value_counts())
print('\n')
print('Shapes of X_train2_sm and y_train2_sm:', X_train2_sm.shape, y_train2_sm.shape,'\n')
print('Target variable instance counts:\n', pd.Series(y_train2_sm).value_counts())
Shapes of X_train1_sm and y_train1_sm: (10428, 44) (10428,) 

Instance counts by values in target variable:
 No     5214
Yes    5214
Name: IsBadBuy, dtype: int64


Shapes of X_train2_sm and y_train2_sm: (10462, 44) (10462,) 

Target variable instance counts:
 No     5231
Yes    5231
Name: IsBadBuy, dtype: int64
In [40]:
X_train_smk3, y_train_smk3 = smk3.fit_resample(X_train.values, y_train)
X_train1_smk3, y_train1_smk3 = smk3.fit_resample(X_train1.values, y_train1)
X_train2_smk3, y_train2_smk3 = smk3.fit_resample(X_train2.values, y_train2)
print('Shapes of X_train_smk3 and y_train_smk3:', X_train1_smk3.shape, y_train_smk3.shape,'\n')
print('Target variable instance counts:\n', pd.Series(y_train1_smk3).value_counts())
print('\n')
print('Shapes of X_train1_smk3 and y_train1_smk3:', X_train1_smk3.shape, y_train1_smk3.shape,'\n')
print('Target variable instance counts:\n', pd.Series(y_train1_smk3).value_counts())
print('\n')
print('Shapes of X_train2_smk3 and y_train2_smk3:', X_train2_smk3.shape, y_train2_smk3.shape,'\n')
print('Target variable instance counts:\n', pd.Series(y_train2_smk3).value_counts())
Shapes of X_train_smk3 and y_train_smk3: (10428, 44) (10426,) 

Target variable instance counts:
 No     5214
Yes    5214
Name: IsBadBuy, dtype: int64


Shapes of X_train1_smk3 and y_train1_smk3: (10428, 44) (10428,) 

Target variable instance counts:
 No     5214
Yes    5214
Name: IsBadBuy, dtype: int64


Shapes of X_train2_smk3 and y_train2_smk3: (10462, 44) (10462,) 

Target variable instance counts:
 No     5231
Yes    5231
Name: IsBadBuy, dtype: int64

Evaluating model performance on the validation and test sets

In [41]:
# func for fitting a model on the train set, predicting on the validation and test sets, and reporting the results
def fun_clf_split_val_test4(clf, clf_name, X_train, y_train, X_val, y_val, X_test, y_test):
  # fit an estimator to train data
  model = clf.fit(X_train,y_train)
  
  # model_name = type(model).__name__
  
  # generate predictions for the target of the validation set
  pred_val = model.predict(X_val)
  
  # create a report on the validation set performance
  val_report_dict = metrics.classification_report(y_val, pred_val, labels=y_levels, output_dict=True)
  val_report_df = round(pd.DataFrame.from_dict(val_report_dict).reset_index(drop=False),2)
  val_report_df['clf_name'] = clf_name
  
  # generate predictions for the target of the test set
  pred_test = model.predict(X_test)

  # create a report on the test set performance
  test_report_dict = metrics.classification_report(y_test, pred_test, labels=y_levels,output_dict=True)
  test_report_df = round(pd.DataFrame.from_dict(test_report_dict).reset_index(drop=False),2)
  test_report_df['clf_name'] = clf_name 
  
  # bundle up validation and test results in a single row
  split_val_test_results_df = pd.DataFrame({'clf_name':[clf_name],'test_results':[test_report_df],'validate_results':[val_report_df]})
  return split_val_test_results_df
In [42]:
# run the function on the original dataset that has been resampled
sm_val_test_results_df = fun_clf_split_val_test4(DTC1,'DTC1',X_train_sm,y_train_sm,X_val,y_val,X_test,y_test)
smk3_val_test_results_df = fun_clf_split_val_test4(DTC1,'DTC1',X_train_smk3,y_train_smk3,X_val,y_val,X_test,y_test)

print('Validation results using smote on df:\n', sm_val_test_results_df['validate_results'].iloc[0],'\n')
print('Validation results using smote with 3 neighbors on df:\n', sm_val_test_results_df['validate_results'].iloc[0],'\n')
print('Test results using smote on df::\n', sm_val_test_results_df['test_results'].iloc[0],'\n')
print('Test results using smote with 3 neighbors on df1:\n', smk3_val_test_results_df['test_results'].iloc[0],'\n')
Validation results using smote on df:
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.90    0.65      0.89       0.78          0.87     DTC1
1     recall     0.98    0.22      0.89       0.60          0.89     DTC1
2   f1-score     0.94    0.33      0.89       0.63          0.86     DTC1
3    support  1748.00  252.00      0.89    2000.00       2000.00     DTC1 

Validation results using smote with 3 neighbors on df:
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.90    0.65      0.89       0.78          0.87     DTC1
1     recall     0.98    0.22      0.89       0.60          0.89     DTC1
2   f1-score     0.94    0.33      0.89       0.63          0.86     DTC1
3    support  1748.00  252.00      0.89    2000.00       2000.00     DTC1 

Test results using smote on df::
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.90    0.84       0.9       0.87          0.90     DTC1
1     recall     0.99    0.27       0.9       0.63          0.90     DTC1
2   f1-score     0.95    0.41       0.9       0.68          0.88     DTC1
3    support  1744.00  256.00       0.9    2000.00       2000.00     DTC1 

Test results using smote with 3 neighbors on df1:
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.90    0.84       0.9       0.87          0.90     DTC1
1     recall     0.99    0.27       0.9       0.63          0.90     DTC1
2   f1-score     0.95    0.41       0.9       0.68          0.88     DTC1
3    support  1744.00  256.00       0.9    2000.00       2000.00     DTC1 

In [43]:
# run the function on the first "new" imbalanced dataset that has been resampled
sm_val_test_results_df1 = fun_clf_split_val_test4(DTC1,'DTC1',X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1)
smk3_val_test_results_df1 = fun_clf_split_val_test4(DTC1,'DTC1',X_train1_smk3,y_train1_smk3,X_val1,y_val1,X_test1,y_test1)

print('Validation results using smote on df1:\n', sm_val_test_results_df1['validate_results'].iloc[0],'\n')
print('Validation results using smote with 3 neighbors on df1:\n', smk3_val_test_results_df1['validate_results'].iloc[0],'\n')
print('Test results using smote on df1:\n',sm_val_test_results_df1['test_results'].iloc[0],'\n')
print('Test results using smote with 3 neighbors on df1:\n',smk3_val_test_results_df1['test_results'].iloc[0],'\n')
Validation results using smote on df1:
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.93    0.68      0.92       0.80          0.90     DTC1
1     recall     0.99    0.24      0.92       0.62          0.92     DTC1
2   f1-score     0.96    0.36      0.92       0.66          0.90     DTC1
3    support  1751.00  184.00      0.92    1935.00       1935.00     DTC1 

Validation results using smote with 3 neighbors on df1:
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.93    0.68      0.92       0.80          0.90     DTC1
1     recall     0.99    0.24      0.92       0.62          0.92     DTC1
2   f1-score     0.96    0.36      0.92       0.66          0.90     DTC1
3    support  1751.00  184.00      0.92    1935.00       1935.00     DTC1 

Test results using smote on df1:
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.92    0.69      0.91       0.81          0.90     DTC1
1     recall     0.99    0.27      0.91       0.63          0.91     DTC1
2   f1-score     0.95    0.38      0.91       0.67          0.90     DTC1
3    support  1740.00  196.00      0.91    1936.00       1936.00     DTC1 

Test results using smote with 3 neighbors on df1:
        index       No     Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.92    0.69      0.91       0.81          0.90     DTC1
1     recall     0.99    0.27      0.91       0.63          0.91     DTC1
2   f1-score     0.95    0.38      0.91       0.67          0.90     DTC1
3    support  1740.00  196.00      0.91    1936.00       1936.00     DTC1 

In [44]:
# run the function on the second "new" imbalanced dataset that has been resampled
sm_val_test_results_df2 = fun_clf_split_val_test4(DTC1, 'DTC1', X_train2_sm, y_train2_sm, X_val2, y_val2, X_test2, y_test2)
smk3_val_test_results_df2 = fun_clf_split_val_test4(DTC1, 'DTC1', X_train2_smk3, y_train2_smk3, X_val2, y_val2, X_test2, y_test2)

print('Validation results using smote on df2:\n', sm_val_test_results_df2['validate_results'].iloc[0],'\n')
print('Validation results using smote with 3 neighbors on df2:\n', smk3_val_test_results_df2['validate_results'].iloc[0],'\n')
print('Test results using smote on df2:\n',sm_val_test_results_df2['test_results'].iloc[0],'\n')
print('Test results using smote with 3 neighbors on df2:\n',smk3_val_test_results_df2['test_results'].iloc[0],'\n')
Validation results using smote on df2:
        index       No    Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.97   0.57      0.96       0.77          0.95     DTC1
1     recall     0.99   0.27      0.96       0.63          0.96     DTC1
2   f1-score     0.98   0.36      0.96       0.67          0.95     DTC1
3    support  1731.00  75.00      0.96    1806.00       1806.00     DTC1 

Validation results using smote with 3 neighbors on df2:
        index       No    Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.97   0.23      0.93       0.60          0.94     DTC1
1     recall     0.96   0.29      0.93       0.63          0.93     DTC1
2   f1-score     0.96   0.26      0.93       0.61          0.93     DTC1
3    support  1731.00  75.00      0.93    1806.00       1806.00     DTC1 

Test results using smote on df2:
        index       No    Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.97   0.42      0.96       0.70          0.96     DTC1
1     recall     0.99   0.29      0.96       0.64          0.96     DTC1
2   f1-score     0.98   0.34      0.96       0.66          0.96     DTC1
3    support  1743.00  63.00      0.96    1806.00       1806.00     DTC1 

Test results using smote with 3 neighbors on df2:
        index       No    Yes  accuracy  macro avg  weighted avg clf_name
0  precision     0.98   0.20      0.93       0.59          0.95     DTC1
1     recall     0.95   0.37      0.93       0.66          0.93     DTC1
2   f1-score     0.96   0.26      0.93       0.61          0.94     DTC1
3    support  1743.00  63.00      0.93    1806.00       1806.00     DTC1 

In [45]:
# func for fitting multiple models on the train set, predicting on the validation and test sets, and reporting the results
def fun_split_val_multi_clf(clf_list,clf_name_list,X_train,y_train,X_val,y_val,X_test,y_test):
  multi_clf_results_list = []
  for i in range(0,len(clf_list)):
      clf_results_df = fun_clf_split_val_test4(clf_list[i],clf_name_list[i],X_train,y_train,X_val, y_val,X_test,y_test)
      multi_clf_results_list.append(clf_results_df)
              # force pd.concat() to create a good index
  split_val_multi_clf_results_df = pd.concat(multi_clf_results_list).reset_index(drop=True)
  return split_val_multi_clf_results_df
In [46]:
# Run cv_val_multi_clf() for resampled training data, clf_list1 and clf_name_list1
sm_multi_clf_results_df = fun_split_val_multi_clf(clf_list1, clf_name_list1,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test)
sm_multi_clf_results_df1 = fun_split_val_multi_clf(clf_list1, clf_name_list1,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1)
smk3_multi_clf_results_df2 = fun_split_val_multi_clf(clf_list1, clf_name_list1,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2)
In [47]:
# create a function for combining all the validation reports and combining test reports for each classifier
def fun_multi_clf_reports(split_val_multi_clf_results_df):
  multi_clf_val_report_list = []
  multi_clf_test_report_list = []
  for index, model_row in split_val_multi_clf_results_df.iterrows():
    multi_clf_val_report_list.append(model_row['validate_results'])
    multi_clf_test_report_list.append(model_row['test_results'])
    # end of for loop
  multi_clf_val_report_df = pd.concat(multi_clf_val_report_list)
  multi_clf_val_report_df = multi_clf_val_report_df.reset_index(drop=False)
  multi_clf_val_report_df = multi_clf_val_report_df.rename(columns={"index": "scorer"})
  multi_clf_test_report_df = pd.concat(multi_clf_test_report_list)
  multi_clf_test_report_df = multi_clf_test_report_df.reset_index(drop=False)
  multi_clf_test_report_df = multi_clf_test_report_df.rename(columns={"index": "scorer"})
  return multi_clf_val_report_df, multi_clf_test_report_df
In [48]:
# run the "combine reports" function for each of the three variations of data
sm_multi_clf_val_report_df, sm_multi_clf_test_report_df = fun_multi_clf_reports(sm_multi_clf_results_df)
sm_multi_clf_val_report_df1, sm_multi_clf_test_report_df1 = fun_multi_clf_reports(sm_multi_clf_results_df1)
smk3_multi_clf_val_report_df2, smk3_multi_clf_test_report_df2 = fun_multi_clf_reports(smk3_multi_clf_results_df2)
In [49]:
# create a function for generating tables and graphs comparing validation and test performance
def fun_split_val_test_comparison4(sam_name,clf_name_df,multi_clf_val_report_df,multi_clf_test_report_df,comp_name):  
  # show tabular comparisons and sns.catplot() of 
  # validation and test results by metric
  for i in range(0,len(metric_map.x_label)):
    val_metric_df = multi_clf_val_report_df[multi_clf_val_report_df['scorer'].\
      str.match(metric_map.scorer.iloc[i])].reset_index(drop=True)[metric_map.col.iloc[i]]
    test_metric_df = multi_clf_test_report_df[multi_clf_test_report_df['scorer'].\
      str.match(metric_map.scorer.iloc[i])].reset_index(drop=True)[metric_map.col.iloc[i]]
    wide_df = pd.concat([clf_name_df, test_metric_df,val_metric_df],axis=1)
        # rename the columns
    wide_df.columns = ['clf_name','test_set_result','val_set_result']
    wide_df=wide_df.sort_values(by=['test_set_result'], ascending=False)
    print(metric_map.x_label.iloc[i],'from test (left) and from validation (right)\n')
    print(wide_df,'\n')
    long_df = pd.DataFrame(columns=['clf_name',metric_map.x_label.iloc[i],comp_name])
    # 
    for r in range(0, len(clf_name_df)):
      long_df = long_df.append({'clf_name':clf_name_df.iloc[r],
                            metric_map.x_label.iloc[i]: test_metric_df.iloc[r], 
                            comp_name:'test_set'},ignore_index=True)
      long_df = long_df.append({'clf_name':clf_name_df.iloc[r],
                            metric_map.x_label.iloc[i]: val_metric_df.iloc[r],
                            comp_name:'val_set'}, ignore_index=True)
    # use seaborn's catplot() to draw performance from test-set and val-set 
    # in groups of classifier
    g = sns.catplot(
      data=long_df, kind="bar",
      y="clf_name", x=metric_map.x_label.iloc[i], hue=comp_name, orient='h',
      order=long_df[long_df['data_source'].str.match('val_set')].\
      sort_values(by=metric_map.x_label.iloc[i],ascending=False)['clf_name']
      )
    g.set(xlim=(0.0, 1.0))
    g.fig.set_figwidth(10) 
    g.fig.suptitle(f'sampler {sam_name}')
In [50]:
# create a dataframe of val_metric and scorer and col mappings that will be used
# to select result score from the validation results and test results
metric_map = pd.DataFrame(columns=['scorer','col','x_label'])
metric_map = metric_map.append({'scorer' : 'precision','col':'accuracy','x_label':'overall_accuracy'} , ignore_index=True)
metric_map = metric_map.append({'scorer' : 'recall','col':'Yes','x_label':'recall_yes'} , ignore_index=True)
metric_map = metric_map.append({'scorer' : 'recall','col':'No', 'x_label':'recall_no'} , ignore_index=True)
metric_map = metric_map.append({'scorer' : 'precision','col':'Yes','x_label':'precision_yes'} , ignore_index=True)
metric_map = metric_map.append({'scorer' : 'precision','col':'No','x_label':'precision_no'} , ignore_index=True)
metric_map = metric_map.append({'scorer' : 'f1-score','col':'Yes', 'x_label':'f1_yes'} , ignore_index=True)
metric_map = metric_map.append({'scorer' : 'f1-score','col':'No', 'x_label':'f1_no'} , ignore_index=True)
metric_map
Out[50]:
scorer col x_label
0 precision accuracy overall_accuracy
1 recall Yes recall_yes
2 recall No recall_no
3 precision Yes precision_yes
4 precision No precision_no
5 f1-score Yes f1_yes
6 f1-score No f1_no
In [51]:
# create a series of the classifier names
clf_name_df = sm_multi_clf_test_report_df1.clf_name[sm_multi_clf_test_report_df1['scorer'].str.match('precision')].reset_index(drop=True)
clf_name_df
Out[51]:
0           DTC1
1     NB_default
2    MLP_default
3    SVC_default
4    KNN_default
Name: clf_name, dtype: object
In [52]:
comp_name = 'data-source' 
print('Comparisons for df and sm:\n')
fun_split_val_test_comparison4('SMOTE',clf_name_df,sm_multi_clf_val_report_df,sm_multi_clf_test_report_df,'data_source')
Comparisons for df and sm:

overall_accuracy from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.90            0.89
2  MLP_default             0.83            0.82
4  KNN_default             0.65            0.63
3  SVC_default             0.62            0.61
1   NB_default             0.43            0.43 

recall_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
1   NB_default             0.86            0.86
3  SVC_default             0.58            0.56
4  KNN_default             0.45            0.40
0         DTC1             0.27            0.22
2  MLP_default             0.19            0.19 

recall_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.99            0.98
2  MLP_default             0.92            0.91
4  KNN_default             0.68            0.66
3  SVC_default             0.62            0.62
1   NB_default             0.36            0.36 

precision_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.84            0.65
2  MLP_default             0.27            0.24
3  SVC_default             0.18            0.17
1   NB_default             0.17            0.16
4  KNN_default             0.17            0.15 

precision_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
1   NB_default             0.95            0.95
3  SVC_default             0.91            0.91
0         DTC1             0.90            0.90
2  MLP_default             0.89            0.89
4  KNN_default             0.89            0.89 

f1_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.41            0.33
1   NB_default             0.28            0.27
3  SVC_default             0.28            0.27
4  KNN_default             0.24            0.22
2  MLP_default             0.22            0.21 

f1_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.95            0.94
2  MLP_default             0.90            0.90
4  KNN_default             0.77            0.76
3  SVC_default             0.74            0.74
1   NB_default             0.52            0.53 

In [53]:
print('Comparisons for df1 and sm:\n')
fun_split_val_test_comparison4('SMOTE',clf_name_df,sm_multi_clf_val_report_df1,sm_multi_clf_test_report_df1,'data_source')
Comparisons for df1 and sm:

overall_accuracy from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.91            0.92
2  MLP_default             0.89            0.90
4  KNN_default             0.66            0.66
3  SVC_default             0.59            0.59
1   NB_default             0.36            0.38 

recall_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
1   NB_default             0.90            0.89
3  SVC_default             0.63            0.54
4  KNN_default             0.36            0.30
0         DTC1             0.27            0.24
2  MLP_default             0.01            0.03 

recall_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.99            0.99
2  MLP_default             0.99            0.99
4  KNN_default             0.69            0.69
3  SVC_default             0.59            0.60
1   NB_default             0.30            0.33 

precision_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.69            0.68
3  SVC_default             0.15            0.12
1   NB_default             0.13            0.12
2  MLP_default             0.12            0.38
4  KNN_default             0.12            0.09 

precision_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
1   NB_default             0.96            0.96
3  SVC_default             0.93            0.93
0         DTC1             0.92            0.93
4  KNN_default             0.91            0.90
2  MLP_default             0.90            0.91 

f1_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.38            0.36
3  SVC_default             0.24            0.20
1   NB_default             0.22            0.21
4  KNN_default             0.18            0.14
2  MLP_default             0.02            0.06 

f1_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.95            0.96
2  MLP_default             0.94            0.95
4  KNN_default             0.79            0.79
3  SVC_default             0.72            0.73
1   NB_default             0.46            0.49 

In [54]:
print('Comparisons for df2 and smk3:\n')
fun_split_val_test_comparison4('SMOTE_K3',clf_name_df,smk3_multi_clf_val_report_df2,smk3_multi_clf_test_report_df2,'data_source')
Comparisons for df2 and smk3:

overall_accuracy from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
2  MLP_default             0.94            0.92
0         DTC1             0.93            0.93
4  KNN_default             0.77            0.76
3  SVC_default             0.63            0.62
1   NB_default             0.27            0.27 

recall_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
1   NB_default             0.94            0.92
3  SVC_default             0.68            0.60
0         DTC1             0.37            0.29
2  MLP_default             0.29            0.16
4  KNN_default             0.21            0.19 

recall_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
2  MLP_default             0.96            0.95
0         DTC1             0.95            0.96
4  KNN_default             0.79            0.78
3  SVC_default             0.63            0.62
1   NB_default             0.24            0.24 

precision_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
2  MLP_default             0.21            0.13
0         DTC1             0.20            0.23
3  SVC_default             0.06            0.06
1   NB_default             0.04            0.05
4  KNN_default             0.03            0.04 

precision_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
1   NB_default             0.99            0.99
0         DTC1             0.98            0.97
3  SVC_default             0.98            0.97
2  MLP_default             0.97            0.96
4  KNN_default             0.97            0.96 

f1_yes from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
0         DTC1             0.26            0.26
2  MLP_default             0.24            0.15
3  SVC_default             0.12            0.12
1   NB_default             0.08            0.09
4  KNN_default             0.06            0.06 

f1_no from test (left) and from validation (right)

      clf_name  test_set_result  val_set_result
2  MLP_default             0.97            0.96
0         DTC1             0.96            0.96
4  KNN_default             0.87            0.86
3  SVC_default             0.77            0.76
1   NB_default             0.39            0.39 

In [55]:
# define an "all tasks" function

def fun_split_val_multi_clf_all_tasks(sam_name,clf_list, clf_name_list,X_train,y_train,X_val,y_val,X_test,y_test,comp_name):
  # func for fitting multiple models on the train set, predicting on the validation and test sets, and reporting the results
  multi_clf_results_df = fun_split_val_multi_clf(clf_list, clf_name_list,X_train,y_train,X_val,y_val,X_test,y_test)

  # combine all the validation reports and combine the test reports for each classifier
  multi_clf_val_report_df, multi_clf_test_report_df = fun_multi_clf_reports(multi_clf_results_df)

  # get a list of the classifier names
  clf_name_df = multi_clf_test_report_df.clf_name[multi_clf_test_report_df['scorer'].str.match('precision')].reset_index(drop=True)

  # generate tables and graphs comparing validation and test performance
  fun_split_val_test_comparison4(sam_name,clf_name_df,multi_clf_val_report_df,multi_clf_test_report_df,comp_name)

Check performance on different lists of classifiers

Now that we have all functions defined, we can see how different classifiers perform. We will test the classifiers grouped in lists earlier in the notebook.

Logistic regression list

In [56]:
# logistic regression list on original df
fun_split_val_multi_clf_all_tasks('SMOTE',lr_list,lr_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
overall_accuracy from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.90            0.89
6      lib_l1             0.90            0.89
4      lib_l2             0.89            0.88
5  lib_lowerC             0.88            0.87
0    lbfgs_l2             0.82            0.81
2      sag_l2             0.60            0.59
3     saga_l2             0.60            0.59 

recall_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
2      sag_l2             0.61            0.60
3     saga_l2             0.61            0.60
0    lbfgs_l2             0.39            0.33
5  lib_lowerC             0.33            0.26
4      lib_l2             0.29            0.23
1   newton_l2             0.28            0.21
6      lib_l1             0.28            0.21 

recall_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.99            0.98
6      lib_l1             0.99            0.98
4      lib_l2             0.98            0.97
5  lib_lowerC             0.97            0.95
0    lbfgs_l2             0.88            0.87
2      sag_l2             0.60            0.59
3     saga_l2             0.60            0.59 

precision_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
6      lib_l1             0.85            0.65
1   newton_l2             0.83            0.64
4      lib_l2             0.70            0.56
5  lib_lowerC             0.59            0.44
0    lbfgs_l2             0.33            0.28
2      sag_l2             0.18            0.17
3     saga_l2             0.18            0.17 

precision_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
0    lbfgs_l2             0.91            0.90
2      sag_l2             0.91            0.91
3     saga_l2             0.91            0.91
5  lib_lowerC             0.91            0.90
1   newton_l2             0.90            0.90
4      lib_l2             0.90            0.90
6      lib_l1             0.90            0.90 

f1_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
5  lib_lowerC             0.43            0.33
1   newton_l2             0.42            0.32
6      lib_l1             0.42            0.32
4      lib_l2             0.41            0.32
0    lbfgs_l2             0.36            0.30
2      sag_l2             0.28            0.27
3     saga_l2             0.28            0.27 

f1_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.95            0.94
6      lib_l1             0.95            0.94
4      lib_l2             0.94            0.93
5  lib_lowerC             0.94            0.93
0    lbfgs_l2             0.90            0.89
2      sag_l2             0.72            0.72
3     saga_l2             0.72            0.72 

In [57]:
# Repeat it for CD_imb1
fun_split_val_multi_clf_all_tasks('SMOTE',lr_list,lr_name_list,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1,'data_source')
overall_accuracy from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.91            0.92
6      lib_l1             0.91            0.92
4      lib_l2             0.90            0.91
5  lib_lowerC             0.90            0.91
0    lbfgs_l2             0.59            0.59
2      sag_l2             0.59            0.59
3     saga_l2             0.59            0.59 

recall_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
0    lbfgs_l2             0.64            0.55
2      sag_l2             0.64            0.55
3     saga_l2             0.64            0.55
1   newton_l2             0.24            0.23
5  lib_lowerC             0.24            0.22
6      lib_l1             0.23            0.23
4      lib_l2             0.22            0.22 

recall_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.99            0.99
6      lib_l1             0.99            0.99
4      lib_l2             0.98            0.98
5  lib_lowerC             0.97            0.98
0    lbfgs_l2             0.58            0.59
2      sag_l2             0.58            0.59
3     saga_l2             0.58            0.59 

precision_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.69            0.68
6      lib_l1             0.69            0.68
4      lib_l2             0.58            0.58
5  lib_lowerC             0.51            0.53
0    lbfgs_l2             0.15            0.12
2      sag_l2             0.15            0.12
3     saga_l2             0.15            0.12 

precision_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
0    lbfgs_l2             0.94            0.93
2      sag_l2             0.94            0.93
3     saga_l2             0.94            0.93
1   newton_l2             0.92            0.92
4      lib_l2             0.92            0.92
5  lib_lowerC             0.92            0.92
6      lib_l1             0.92            0.92 

f1_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.36            0.35
6      lib_l1             0.35            0.34
5  lib_lowerC             0.33            0.31
4      lib_l2             0.32            0.32
0    lbfgs_l2             0.24            0.20
2      sag_l2             0.24            0.20
3     saga_l2             0.24            0.20 

f1_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.95            0.96
4      lib_l2             0.95            0.95
5  lib_lowerC             0.95            0.95
6      lib_l1             0.95            0.96
0    lbfgs_l2             0.72            0.72
2      sag_l2             0.72            0.72
3     saga_l2             0.72            0.72 

In [58]:
# Repeat it for df2
fun_split_val_multi_clf_all_tasks('SMOTE_k3',lr_list,lr_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
overall_accuracy from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.96            0.96
4      lib_l2             0.96            0.96
5  lib_lowerC             0.96            0.96
6      lib_l1             0.96            0.96
0    lbfgs_l2             0.60            0.58
2      sag_l2             0.60            0.58
3     saga_l2             0.60            0.58 

recall_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
0    lbfgs_l2             0.70            0.59
2      sag_l2             0.70            0.59
3     saga_l2             0.70            0.59
5  lib_lowerC             0.16            0.11
6      lib_l1             0.10            0.08
1   newton_l2             0.08            0.07
4      lib_l2             0.06            0.08 

recall_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
6      lib_l1             1.00            1.00
1   newton_l2             0.99            1.00
4      lib_l2             0.99            1.00
5  lib_lowerC             0.99            0.99
0    lbfgs_l2             0.60            0.58
2      sag_l2             0.60            0.58
3     saga_l2             0.60            0.58 

precision_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
6      lib_l1             0.43            0.50
1   newton_l2             0.33            0.45
5  lib_lowerC             0.30            0.42
4      lib_l2             0.27            0.43
0    lbfgs_l2             0.06            0.06
2      sag_l2             0.06            0.06
3     saga_l2             0.06            0.06 

precision_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
0    lbfgs_l2             0.98            0.97
2      sag_l2             0.98            0.97
3     saga_l2             0.98            0.97
1   newton_l2             0.97            0.96
4      lib_l2             0.97            0.96
5  lib_lowerC             0.97            0.96
6      lib_l1             0.97            0.96 

f1_yes from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
5  lib_lowerC             0.21            0.17
6      lib_l1             0.16            0.14
1   newton_l2             0.13            0.12
0    lbfgs_l2             0.11            0.10
2      sag_l2             0.11            0.10
3     saga_l2             0.11            0.10
4      lib_l2             0.10            0.13 

f1_no from test (left) and from validation (right)

     clf_name  test_set_result  val_set_result
1   newton_l2             0.98            0.98
4      lib_l2             0.98            0.98
5  lib_lowerC             0.98            0.98
6      lib_l1             0.98            0.98
0    lbfgs_l2             0.74            0.73
2      sag_l2             0.74            0.73
3     saga_l2             0.74            0.73 

Bagging list

In [59]:
# bagging list on original df
fun_split_val_multi_clf_all_tasks('SMOTE',bag_list,bag_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
overall_accuracy from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
0  Bagging_default             0.88            0.88
1       Bagging_20             0.88            0.88
3       Bagging_lr             0.79            0.77
2      Bagging_SVC             0.63            0.63 

recall_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
2      Bagging_SVC             0.57            0.54
3       Bagging_lr             0.48            0.42
1       Bagging_20             0.27            0.23
0  Bagging_default             0.26            0.24 

recall_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
0  Bagging_default             0.97            0.97
1       Bagging_20             0.97            0.97
3       Bagging_lr             0.84            0.82
2      Bagging_SVC             0.64            0.64 

precision_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.59            0.52
0  Bagging_default             0.55            0.51
3       Bagging_lr             0.30            0.25
2      Bagging_SVC             0.19            0.18 

precision_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
3       Bagging_lr             0.92            0.91
2      Bagging_SVC             0.91            0.91
0  Bagging_default             0.90            0.90
1       Bagging_20             0.90            0.90 

f1_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.37            0.32
3       Bagging_lr             0.37            0.32
0  Bagging_default             0.35            0.33
2      Bagging_SVC             0.28            0.27 

f1_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.94            0.93
0  Bagging_default             0.93            0.93
3       Bagging_lr             0.88            0.86
2      Bagging_SVC             0.75            0.75 

In [60]:
# bagging list on df1
fun_split_val_multi_clf_all_tasks('SMOTE',bag_list,bag_name_list,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1,'data_source')
overall_accuracy from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.91            0.90
0  Bagging_default             0.90            0.90
3       Bagging_lr             0.82            0.83
2      Bagging_SVC             0.59            0.59 

recall_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
2      Bagging_SVC             0.63            0.54
3       Bagging_lr             0.36            0.34
0  Bagging_default             0.28            0.20
1       Bagging_20             0.28            0.20 

recall_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
0  Bagging_default             0.98            0.97
1       Bagging_20             0.98            0.98
3       Bagging_lr             0.88            0.88
2      Bagging_SVC             0.59            0.60 

precision_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.64            0.49
0  Bagging_default             0.56            0.42
3       Bagging_lr             0.25            0.23
2      Bagging_SVC             0.15            0.12 

precision_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
2      Bagging_SVC             0.93            0.93
0  Bagging_default             0.92            0.92
1       Bagging_20             0.92            0.92
3       Bagging_lr             0.92            0.93 

f1_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.38            0.28
0  Bagging_default             0.37            0.27
3       Bagging_lr             0.29            0.28
2      Bagging_SVC             0.24            0.20 

f1_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
0  Bagging_default             0.95            0.95
1       Bagging_20             0.95            0.95
3       Bagging_lr             0.90            0.90
2      Bagging_SVC             0.72            0.73 

In [61]:
# bagging list on df2
fun_split_val_multi_clf_all_tasks('SMOTE_k3',bag_list,bag_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
overall_accuracy from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
0  Bagging_default             0.96            0.96
1       Bagging_20             0.96            0.95
3       Bagging_lr             0.88            0.86
2      Bagging_SVC             0.64            0.62 

recall_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
2      Bagging_SVC             0.68            0.61
3       Bagging_lr             0.54            0.28
1       Bagging_20             0.24            0.16
0  Bagging_default             0.19            0.13 

recall_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
0  Bagging_default             0.99            0.99
1       Bagging_20             0.99            0.99
3       Bagging_lr             0.89            0.88
2      Bagging_SVC             0.64            0.62 

precision_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.45            0.39
0  Bagging_default             0.36            0.38
3       Bagging_lr             0.15            0.10
2      Bagging_SVC             0.06            0.07 

precision_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
2      Bagging_SVC             0.98            0.97
3       Bagging_lr             0.98            0.97
0  Bagging_default             0.97            0.96
1       Bagging_20             0.97            0.96 

f1_yes from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
1       Bagging_20             0.31            0.23
0  Bagging_default             0.25            0.20
3       Bagging_lr             0.24            0.14
2      Bagging_SVC             0.12            0.12 

f1_no from test (left) and from validation (right)

          clf_name  test_set_result  val_set_result
0  Bagging_default             0.98            0.98
1       Bagging_20             0.98            0.98
3       Bagging_lr             0.93            0.92
2      Bagging_SVC             0.77            0.76 

Boosting list

In [62]:
# boosting list on original df
fun_split_val_multi_clf_all_tasks('SMOTE',boost_list,boost_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
overall_accuracy from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
1     Ada_dt_halflearning             0.90            0.88
0             Ada_default             0.89            0.88
2               Ada_lr_15             0.79            0.77
3  Ada_lr_15_halflearning             0.67            0.66 

recall_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
3  Ada_lr_15_halflearning             0.59            0.54
2               Ada_lr_15             0.48            0.42
1     Ada_dt_halflearning             0.31            0.22
0             Ada_default             0.27            0.21 

recall_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0             Ada_default             0.98            0.98
1     Ada_dt_halflearning             0.98            0.98
2               Ada_lr_15             0.83            0.82
3  Ada_lr_15_halflearning             0.68            0.68 

precision_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
1     Ada_dt_halflearning             0.72            0.58
0             Ada_default             0.66            0.56
2               Ada_lr_15             0.30            0.25
3  Ada_lr_15_halflearning             0.21            0.19 

precision_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
2               Ada_lr_15             0.92            0.91
3  Ada_lr_15_halflearning             0.92            0.91
1     Ada_dt_halflearning             0.91            0.90
0             Ada_default             0.90            0.90 

f1_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
1     Ada_dt_halflearning             0.43            0.32
0             Ada_default             0.39            0.31
2               Ada_lr_15             0.37            0.31
3  Ada_lr_15_halflearning             0.31            0.28 

f1_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0             Ada_default             0.94            0.93
1     Ada_dt_halflearning             0.94            0.94
2               Ada_lr_15             0.87            0.86
3  Ada_lr_15_halflearning             0.78            0.78 

In [63]:
# boosting list on df1
fun_split_val_multi_clf_all_tasks('SMOTE_k3',boost_list,boost_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
overall_accuracy from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0             Ada_default             0.96            0.95
1     Ada_dt_halflearning             0.95            0.95
2               Ada_lr_15             0.87            0.86
3  Ada_lr_15_halflearning             0.71            0.71 

recall_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
3  Ada_lr_15_halflearning             0.65            0.55
2               Ada_lr_15             0.54            0.31
0             Ada_default             0.22            0.13
1     Ada_dt_halflearning             0.22            0.17 

recall_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0             Ada_default             0.99            0.99
1     Ada_dt_halflearning             0.98            0.98
2               Ada_lr_15             0.88            0.88
3  Ada_lr_15_halflearning             0.72            0.72 

precision_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0             Ada_default             0.38            0.37
1     Ada_dt_halflearning             0.25            0.32
2               Ada_lr_15             0.14            0.10
3  Ada_lr_15_halflearning             0.08            0.08 

precision_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
2               Ada_lr_15             0.98            0.97
3  Ada_lr_15_halflearning             0.98            0.97
0             Ada_default             0.97            0.96
1     Ada_dt_halflearning             0.97            0.96 

f1_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0             Ada_default             0.28            0.20
1     Ada_dt_halflearning             0.24            0.23
2               Ada_lr_15             0.23            0.15
3  Ada_lr_15_halflearning             0.14            0.14 

f1_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0             Ada_default             0.98            0.98
1     Ada_dt_halflearning             0.97            0.97
2               Ada_lr_15             0.93            0.92
3  Ada_lr_15_halflearning             0.83            0.83 

In [64]:
# boosting list on df2
fun_split_val_multi_clf_all_tasks('SMOTE',rf_list,rf_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
overall_accuracy from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.89            0.89
1  rf_entropy_default             0.89            0.88
2               rf_50             0.89            0.88
3       rf_entropy_50             0.89            0.88
4             rf_max7             0.89            0.87
5     rf_entropy_max7             0.89            0.88 

recall_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
4             rf_max7             0.29            0.26
5     rf_entropy_max7             0.29            0.25
1  rf_entropy_default             0.28            0.23
3       rf_entropy_50             0.28            0.23
0          rf_default             0.27            0.24
2               rf_50             0.27            0.22 

recall_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.98            0.98
1  rf_entropy_default             0.98            0.98
2               rf_50             0.98            0.98
3       rf_entropy_50             0.98            0.97
4             rf_max7             0.98            0.96
5     rf_entropy_max7             0.98            0.97 

precision_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
5     rf_entropy_max7             0.70            0.55
2               rf_50             0.67            0.61
4             rf_max7             0.67            0.50
0          rf_default             0.66            0.62
1  rf_entropy_default             0.65            0.60
3       rf_entropy_50             0.65            0.56 

precision_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default              0.9             0.9
1  rf_entropy_default              0.9             0.9
2               rf_50              0.9             0.9
3       rf_entropy_50              0.9             0.9
4             rf_max7              0.9             0.9
5     rf_entropy_max7              0.9             0.9 

f1_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
4             rf_max7             0.41            0.34
5     rf_entropy_max7             0.41            0.34
0          rf_default             0.39            0.35
1  rf_entropy_default             0.39            0.33
2               rf_50             0.39            0.33
3       rf_entropy_50             0.39            0.32 

f1_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.94            0.94
1  rf_entropy_default             0.94            0.94
2               rf_50             0.94            0.94
3       rf_entropy_50             0.94            0.93
4             rf_max7             0.94            0.93
5     rf_entropy_max7             0.94            0.93 

Random forest list

In [65]:
# logistic regression list original df
fun_split_val_multi_clf_all_tasks('SMOTE',rf_list,rf_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
overall_accuracy from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.89            0.89
1  rf_entropy_default             0.89            0.88
2               rf_50             0.89            0.88
3       rf_entropy_50             0.89            0.88
4             rf_max7             0.89            0.87
5     rf_entropy_max7             0.89            0.88 

recall_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
4             rf_max7             0.29            0.26
5     rf_entropy_max7             0.29            0.25
1  rf_entropy_default             0.28            0.23
3       rf_entropy_50             0.28            0.23
0          rf_default             0.27            0.24
2               rf_50             0.27            0.22 

recall_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.98            0.98
1  rf_entropy_default             0.98            0.98
2               rf_50             0.98            0.98
3       rf_entropy_50             0.98            0.97
4             rf_max7             0.98            0.96
5     rf_entropy_max7             0.98            0.97 

precision_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
5     rf_entropy_max7             0.70            0.55
2               rf_50             0.67            0.61
4             rf_max7             0.67            0.50
0          rf_default             0.66            0.62
1  rf_entropy_default             0.65            0.60
3       rf_entropy_50             0.65            0.56 

precision_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default              0.9             0.9
1  rf_entropy_default              0.9             0.9
2               rf_50              0.9             0.9
3       rf_entropy_50              0.9             0.9
4             rf_max7              0.9             0.9
5     rf_entropy_max7              0.9             0.9 

f1_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
4             rf_max7             0.41            0.34
5     rf_entropy_max7             0.41            0.34
0          rf_default             0.39            0.35
1  rf_entropy_default             0.39            0.33
2               rf_50             0.39            0.33
3       rf_entropy_50             0.39            0.32 

f1_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.94            0.94
1  rf_entropy_default             0.94            0.94
2               rf_50             0.94            0.94
3       rf_entropy_50             0.94            0.93
4             rf_max7             0.94            0.93
5     rf_entropy_max7             0.94            0.93 

In [66]:
# logistic regression list on df1
fun_split_val_multi_clf_all_tasks('SMOTE',rf_list,rf_name_list,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1,'data_source')
overall_accuracy from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
1  rf_entropy_default             0.92            0.91
3       rf_entropy_50             0.92            0.91
0          rf_default             0.91            0.91
2               rf_50             0.91            0.91
4             rf_max7             0.91            0.91
5     rf_entropy_max7             0.91            0.91 

recall_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
4             rf_max7             0.28            0.24
5     rf_entropy_max7             0.28            0.24
1  rf_entropy_default             0.27            0.22
0          rf_default             0.26            0.23
3       rf_entropy_50             0.26            0.22
2               rf_50             0.24            0.23 

recall_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.99            0.98
1  rf_entropy_default             0.99            0.99
2               rf_50             0.99            0.98
3       rf_entropy_50             0.99            0.99
4             rf_max7             0.98            0.98
5     rf_entropy_max7             0.98            0.98 

precision_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
3       rf_entropy_50             0.77            0.63
1  rf_entropy_default             0.75            0.64
0          rf_default             0.70            0.61
2               rf_50             0.70            0.60
5     rf_entropy_max7             0.60            0.57
4             rf_max7             0.57            0.55 

precision_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.92            0.92
1  rf_entropy_default             0.92            0.92
2               rf_50             0.92            0.92
3       rf_entropy_50             0.92            0.92
4             rf_max7             0.92            0.92
5     rf_entropy_max7             0.92            0.93 

f1_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
1  rf_entropy_default             0.40            0.33
3       rf_entropy_50             0.39            0.33
0          rf_default             0.38            0.33
5     rf_entropy_max7             0.38            0.34
4             rf_max7             0.37            0.34
2               rf_50             0.36            0.33 

f1_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
1  rf_entropy_default             0.96            0.95
3       rf_entropy_50             0.96            0.95
0          rf_default             0.95            0.95
2               rf_50             0.95            0.95
4             rf_max7             0.95            0.95
5     rf_entropy_max7             0.95            0.95 

In [67]:
# logistic regression list on df2
fun_split_val_multi_clf_all_tasks('SMOTE_k3',rf_list,rf_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
overall_accuracy from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.97            0.96
1  rf_entropy_default             0.97            0.96
2               rf_50             0.97            0.96
3       rf_entropy_50             0.97            0.96
4             rf_max7             0.96            0.95
5     rf_entropy_max7             0.96            0.96 

recall_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
5     rf_entropy_max7             0.30            0.19
4             rf_max7             0.27            0.15
2               rf_50             0.17            0.13
3       rf_entropy_50             0.17            0.11
0          rf_default             0.16            0.11
1  rf_entropy_default             0.16            0.11 

recall_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             1.00            1.00
1  rf_entropy_default             1.00            1.00
2               rf_50             1.00            1.00
3       rf_entropy_50             1.00            1.00
4             rf_max7             0.99            0.99
5     rf_entropy_max7             0.98            0.99 

precision_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
3       rf_entropy_50             0.73            0.73
1  rf_entropy_default             0.71            0.80
2               rf_50             0.69            0.91
0          rf_default             0.62            0.89
4             rf_max7             0.40            0.37
5     rf_entropy_max7             0.40            0.41 

precision_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.97            0.96
1  rf_entropy_default             0.97            0.96
2               rf_50             0.97            0.96
3       rf_entropy_50             0.97            0.96
4             rf_max7             0.97            0.96
5     rf_entropy_max7             0.97            0.97 

f1_yes from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
5     rf_entropy_max7             0.35            0.26
4             rf_max7             0.32            0.21
2               rf_50             0.28            0.23
3       rf_entropy_50             0.28            0.19
1  rf_entropy_default             0.26            0.19
0          rf_default             0.25            0.19 

f1_no from test (left) and from validation (right)

             clf_name  test_set_result  val_set_result
0          rf_default             0.98            0.98
1  rf_entropy_default             0.98            0.98
2               rf_50             0.98            0.98
3       rf_entropy_50             0.98            0.98
4             rf_max7             0.98            0.98
5     rf_entropy_max7             0.98            0.98 

Evaluate different sampling strategies

We can now run the "all tasks" function for multiple oversampling objects, undersampling objects, and objects that combine the two sampling strategies.

In [68]:
# create a new version of the "all tasks" function that evaluates different resampling techniques
def fun_multi_clf_sampler_all_tasks(sampler_list,sampler_name_list,clf_list, clf_name_list,X_train,y_train,X_val, y_val,X_test,y_test,comp_name):
  for i in range(0, len(sampler_list)):
    # assign each sampler, and its name to a variable
    sam = sampler_list[i]
    sam_name = sampler_name_list[i]

    # perform the resampling on the training set 
    X_train_sam, y_train_sam = sam.fit_resample(X_train, y_train)

    # display the value counts after resampling
    print('Sampler: ',sam,'\n')
    print('y value counts of resampled train set\n',pd.Series(y_train_sam).value_counts(),'\n')

    # func for fitting multiple models on the train set, predicting on the validation and test sets, and reporting the results
    multi_clf_results_df = fun_split_val_multi_clf(clf_list, clf_name_list,X_train_sam,y_train_sam,X_val,y_val,X_test,y_test)

    # combine all the validation reports and combine the test reports for each classifier
    multi_clf_val_report_df, multi_clf_test_report_df = fun_multi_clf_reports(multi_clf_results_df)

    # get a list of the classifier names
    clf_name_df = multi_clf_test_report_df.clf_name[multi_clf_test_report_df['scorer'].str.match('precision')].reset_index(drop=True)

    # generate tables and graphs comparing validation and test performance
    fun_split_val_test_comparison4(sam_name,clf_name_df,multi_clf_val_report_df,multi_clf_test_report_df,comp_name)

Evaluating oversampling techniques on the three datasets

In [69]:
# Evaluate oversampling techniques on original df
print('For df:\n')
fun_multi_clf_sampler_all_tasks(osampler_list,osampler_name_list,clf_list4,clf_name_list4,X_train.values,y_train,X_val,y_val,X_test,y_test,'data_source')
For df:

Sampler:  RandomOverSampler(random_state=42) 

y value counts of resampled train set
 Yes    5213
No     5213
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.88            0.87
2     lr_lib_l1_lowerc             0.75            0.74
5              rf_max7             0.75            0.76
0                 DTC1             0.70            0.71
4  Ada_lr_15_halflearn             0.62            0.62
1          SVC_default             0.61            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.61            0.59
4  Ada_lr_15_halflearn             0.61            0.59
2     lr_lib_l1_lowerc             0.59            0.57
1          SVC_default             0.58            0.57
5              rf_max7             0.57            0.55
3           Bagging_20             0.30            0.25 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.96            0.96
5              rf_max7             0.78            0.79
2     lr_lib_l1_lowerc             0.77            0.76
0                 DTC1             0.72            0.72
1          SVC_default             0.62            0.62
4  Ada_lr_15_halflearn             0.62            0.63 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.55            0.48
5              rf_max7             0.28            0.27
2     lr_lib_l1_lowerc             0.27            0.26
0                 DTC1             0.24            0.24
4  Ada_lr_15_halflearn             0.19            0.19
1          SVC_default             0.18            0.18 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.93            0.92
2     lr_lib_l1_lowerc             0.93            0.92
4  Ada_lr_15_halflearn             0.92            0.91
5              rf_max7             0.92            0.92
1          SVC_default             0.91            0.91
3           Bagging_20             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.39            0.33
2     lr_lib_l1_lowerc             0.37            0.35
5              rf_max7             0.37            0.36
0                 DTC1             0.35            0.34
4  Ada_lr_15_halflearn             0.29            0.28
1          SVC_default             0.28            0.27 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.93            0.93
5              rf_max7             0.85            0.85
2     lr_lib_l1_lowerc             0.84            0.84
0                 DTC1             0.81            0.81
4  Ada_lr_15_halflearn             0.74            0.74
1          SVC_default             0.73            0.73 

Sampler:  ADASYN(random_state=42) 

y value counts of resampled train set
 Yes    5219
No     5213
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.90            0.89
2     lr_lib_l1_lowerc             0.89            0.88
5              rf_max7             0.89            0.88
3           Bagging_20             0.88            0.88
4  Ada_lr_15_halflearn             0.68            0.67
1          SVC_default             0.59            0.58 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.62            0.62
4  Ada_lr_15_halflearn             0.58            0.53
2     lr_lib_l1_lowerc             0.29            0.22
3           Bagging_20             0.29            0.23
5              rf_max7             0.28            0.24
0                 DTC1             0.27            0.22 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.98
2     lr_lib_l1_lowerc             0.98            0.98
5              rf_max7             0.98            0.98
3           Bagging_20             0.97            0.97
4  Ada_lr_15_halflearn             0.70            0.69
1          SVC_default             0.58            0.58 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.84            0.65
2     lr_lib_l1_lowerc             0.71            0.59
5              rf_max7             0.70            0.59
3           Bagging_20             0.60            0.52
4  Ada_lr_15_halflearn             0.22            0.20
1          SVC_default             0.18            0.18 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.92            0.91
1          SVC_default             0.91            0.91
0                 DTC1             0.90            0.90
2     lr_lib_l1_lowerc             0.90            0.90
3           Bagging_20             0.90            0.90
5              rf_max7             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.41            0.33
2     lr_lib_l1_lowerc             0.41            0.32
5              rf_max7             0.40            0.34
3           Bagging_20             0.39            0.32
4  Ada_lr_15_halflearn             0.32            0.29
1          SVC_default             0.28            0.27 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.94
2     lr_lib_l1_lowerc             0.94            0.94
3           Bagging_20             0.94            0.93
5              rf_max7             0.94            0.94
4  Ada_lr_15_halflearn             0.79            0.79
1          SVC_default             0.71            0.71 

Sampler:  SMOTE(random_state=42) 

y value counts of resampled train set
 Yes    5213
No     5213
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.90            0.89
2     lr_lib_l1_lowerc             0.89            0.88
5              rf_max7             0.89            0.87
3           Bagging_20             0.88            0.88
4  Ada_lr_15_halflearn             0.67            0.66
1          SVC_default             0.62            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.59            0.54
1          SVC_default             0.58            0.56
2     lr_lib_l1_lowerc             0.29            0.22
5              rf_max7             0.29            0.26
0                 DTC1             0.27            0.22
3           Bagging_20             0.27            0.23 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.98
2     lr_lib_l1_lowerc             0.98            0.98
5              rf_max7             0.98            0.96
3           Bagging_20             0.97            0.97
4  Ada_lr_15_halflearn             0.68            0.68
1          SVC_default             0.62            0.62 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.84            0.65
2     lr_lib_l1_lowerc             0.70            0.59
5              rf_max7             0.67            0.50
3           Bagging_20             0.59            0.52
4  Ada_lr_15_halflearn             0.21            0.19
1          SVC_default             0.18            0.17 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.92            0.91
1          SVC_default             0.91            0.91
0                 DTC1             0.90            0.90
2     lr_lib_l1_lowerc             0.90            0.90
3           Bagging_20             0.90            0.90
5              rf_max7             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.41            0.33
2     lr_lib_l1_lowerc             0.41            0.32
5              rf_max7             0.41            0.34
3           Bagging_20             0.37            0.32
4  Ada_lr_15_halflearn             0.31            0.28
1          SVC_default             0.28            0.27 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.94
2     lr_lib_l1_lowerc             0.94            0.94
3           Bagging_20             0.94            0.93
5              rf_max7             0.94            0.93
4  Ada_lr_15_halflearn             0.78            0.78
1          SVC_default             0.74            0.74 

Sampler:  SVMSMOTE(random_state=42) 

y value counts of resampled train set
 Yes    5213
No     5213
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.89            0.88
5              rf_max7             0.89            0.87
3           Bagging_20             0.88            0.88
2     lr_lib_l1_lowerc             0.87            0.87
1          SVC_default             0.80            0.80
4  Ada_lr_15_halflearn             0.76            0.76 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.40            0.31
5              rf_max7             0.34            0.27
2     lr_lib_l1_lowerc             0.33            0.26
0                 DTC1             0.27            0.23
3           Bagging_20             0.26            0.25
1          SVC_default             0.21            0.18 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.98
3           Bagging_20             0.97            0.97
5              rf_max7             0.97            0.96
2     lr_lib_l1_lowerc             0.95            0.95
1          SVC_default             0.89            0.89
4  Ada_lr_15_halflearn             0.81            0.82 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.74            0.61
5              rf_max7             0.66            0.49
3           Bagging_20             0.58            0.52
2     lr_lib_l1_lowerc             0.50            0.44
4  Ada_lr_15_halflearn             0.24            0.20
1          SVC_default             0.22            0.19 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.91            0.90
5              rf_max7             0.91            0.90
0                 DTC1             0.90            0.90
3           Bagging_20             0.90            0.90
4  Ada_lr_15_halflearn             0.90            0.89
1          SVC_default             0.88            0.88 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.45            0.35
0                 DTC1             0.40            0.34
2     lr_lib_l1_lowerc             0.40            0.33
3           Bagging_20             0.36            0.34
4  Ada_lr_15_halflearn             0.30            0.24
1          SVC_default             0.21            0.19 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.94            0.94
5              rf_max7             0.94            0.93
2     lr_lib_l1_lowerc             0.93            0.93
3           Bagging_20             0.93            0.93
1          SVC_default             0.89            0.89
4  Ada_lr_15_halflearn             0.86            0.86 

Sampler:  BorderlineSMOTE(random_state=42) 

y value counts of resampled train set
 Yes    5213
No     5213
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.90            0.89
2     lr_lib_l1_lowerc             0.89            0.88
5              rf_max7             0.89            0.87
3           Bagging_20             0.88            0.88
4  Ada_lr_15_halflearn             0.69            0.68
1          SVC_default             0.62            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.58            0.56
4  Ada_lr_15_halflearn             0.57            0.52
5              rf_max7             0.32            0.29
2     lr_lib_l1_lowerc             0.29            0.22
0                 DTC1             0.27            0.22
3           Bagging_20             0.27            0.24 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.97            0.97
5              rf_max7             0.97            0.95
4  Ada_lr_15_halflearn             0.71            0.70
1          SVC_default             0.62            0.62 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.83            0.65
2     lr_lib_l1_lowerc             0.70            0.57
5              rf_max7             0.60            0.46
3           Bagging_20             0.59            0.54
4  Ada_lr_15_halflearn             0.22            0.20
1          SVC_default             0.18            0.17 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.92            0.91
1          SVC_default             0.91            0.91
5              rf_max7             0.91            0.90
0                 DTC1             0.90            0.90
2     lr_lib_l1_lowerc             0.90            0.90
3           Bagging_20             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.42            0.35
0                 DTC1             0.41            0.33
2     lr_lib_l1_lowerc             0.41            0.32
3           Bagging_20             0.37            0.33
4  Ada_lr_15_halflearn             0.32            0.29
1          SVC_default             0.28            0.27 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.94
2     lr_lib_l1_lowerc             0.94            0.93
3           Bagging_20             0.94            0.93
5              rf_max7             0.94            0.93
4  Ada_lr_15_halflearn             0.80            0.79
1          SVC_default             0.74            0.74 

In [70]:
# Evaluate oversampling techniques on df1
print('For df1:\n')
fun_multi_clf_sampler_all_tasks(osampler_list,osampler_name_list,clf_list4,clf_name_list4,X_train1.values,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
For df1:

Sampler:  RandomOverSampler(random_state=42) 

y value counts of resampled train set
 No     5214
Yes    5214
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.90            0.90
5              rf_max7             0.79            0.79
2     lr_lib_l1_lowerc             0.75            0.75
0                 DTC1             0.66            0.66
1          SVC_default             0.61            0.61
4  Ada_lr_15_halflearn             0.61            0.60 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.68            0.60
4  Ada_lr_15_halflearn             0.65            0.62
2     lr_lib_l1_lowerc             0.62            0.57
1          SVC_default             0.61            0.54
5              rf_max7             0.57            0.55
3           Bagging_20             0.26            0.20 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.97            0.97
5              rf_max7             0.81            0.81
2     lr_lib_l1_lowerc             0.77            0.77
0                 DTC1             0.66            0.67
1          SVC_default             0.61            0.62
4  Ada_lr_15_halflearn             0.60            0.60 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.52            0.40
5              rf_max7             0.26            0.24
2     lr_lib_l1_lowerc             0.23            0.21
0                 DTC1             0.18            0.16
4  Ada_lr_15_halflearn             0.16            0.14
1          SVC_default             0.15            0.13 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.94
2     lr_lib_l1_lowerc             0.95            0.94
4  Ada_lr_15_halflearn             0.94            0.94
5              rf_max7             0.94            0.94
1          SVC_default             0.93            0.93
3           Bagging_20             0.92            0.92 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.35            0.26
5              rf_max7             0.35            0.33
2     lr_lib_l1_lowerc             0.33            0.30
0                 DTC1             0.29            0.25
4  Ada_lr_15_halflearn             0.25            0.23
1          SVC_default             0.24            0.21 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.95            0.94
5              rf_max7             0.87            0.87
2     lr_lib_l1_lowerc             0.85            0.85
0                 DTC1             0.78            0.78
1          SVC_default             0.74            0.74
4  Ada_lr_15_halflearn             0.73            0.73 

Sampler:  ADASYN(random_state=42) 

y value counts of resampled train set
 No     5214
Yes    5208
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.91            0.92
2     lr_lib_l1_lowerc             0.91            0.91
3           Bagging_20             0.91            0.91
5              rf_max7             0.91            0.91
4  Ada_lr_15_halflearn             0.69            0.69
1          SVC_default             0.58            0.58 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.64            0.57
4  Ada_lr_15_halflearn             0.58            0.51
5              rf_max7             0.28            0.24
0                 DTC1             0.27            0.24
3           Bagging_20             0.27            0.22
2     lr_lib_l1_lowerc             0.24            0.22 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.99
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
5              rf_max7             0.98            0.98
4  Ada_lr_15_halflearn             0.71            0.70
1          SVC_default             0.57            0.58 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.69            0.68
2     lr_lib_l1_lowerc             0.64            0.60
5              rf_max7             0.59            0.57
3           Bagging_20             0.58            0.52
4  Ada_lr_15_halflearn             0.18            0.15
1          SVC_default             0.14            0.13 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.94            0.93
1          SVC_default             0.93            0.93
0                 DTC1             0.92            0.93
2     lr_lib_l1_lowerc             0.92            0.92
3           Bagging_20             0.92            0.92
5              rf_max7             0.92            0.92 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.38            0.36
5              rf_max7             0.38            0.34
3           Bagging_20             0.37            0.31
2     lr_lib_l1_lowerc             0.35            0.33
4  Ada_lr_15_halflearn             0.28            0.23
1          SVC_default             0.24            0.21 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.96
2     lr_lib_l1_lowerc             0.95            0.95
3           Bagging_20             0.95            0.95
5              rf_max7             0.95            0.95
4  Ada_lr_15_halflearn             0.81            0.80
1          SVC_default             0.71            0.72 

Sampler:  SMOTE(random_state=42) 

y value counts of resampled train set
 No     5214
Yes    5214
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.91            0.92
2     lr_lib_l1_lowerc             0.91            0.91
3           Bagging_20             0.91            0.90
5              rf_max7             0.91            0.91
4  Ada_lr_15_halflearn             0.68            0.67
1          SVC_default             0.59            0.59 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.63            0.54
4  Ada_lr_15_halflearn             0.59            0.51
3           Bagging_20             0.28            0.20
5              rf_max7             0.28            0.24
0                 DTC1             0.27            0.24
2     lr_lib_l1_lowerc             0.22            0.22 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.99
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
5              rf_max7             0.98            0.98
4  Ada_lr_15_halflearn             0.69            0.69
1          SVC_default             0.59            0.60 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.69            0.68
3           Bagging_20             0.64            0.49
2     lr_lib_l1_lowerc             0.61            0.60
5              rf_max7             0.57            0.55
4  Ada_lr_15_halflearn             0.18            0.15
1          SVC_default             0.15            0.12 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.94            0.93
1          SVC_default             0.93            0.93
0                 DTC1             0.92            0.93
2     lr_lib_l1_lowerc             0.92            0.92
3           Bagging_20             0.92            0.92
5              rf_max7             0.92            0.92 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.38            0.36
3           Bagging_20             0.38            0.28
5              rf_max7             0.37            0.34
2     lr_lib_l1_lowerc             0.33            0.33
4  Ada_lr_15_halflearn             0.27            0.23
1          SVC_default             0.24            0.20 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.96
2     lr_lib_l1_lowerc             0.95            0.95
3           Bagging_20             0.95            0.95
5              rf_max7             0.95            0.95
4  Ada_lr_15_halflearn             0.79            0.79
1          SVC_default             0.72            0.73 

Sampler:  SVMSMOTE(random_state=42) 

y value counts of resampled train set
 No     5214
Yes    3134
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.91            0.92
2     lr_lib_l1_lowerc             0.91            0.91
3           Bagging_20             0.91            0.91
5              rf_max7             0.91            0.92
1          SVC_default             0.88            0.88
4  Ada_lr_15_halflearn             0.84            0.84 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.31            0.25
0                 DTC1             0.27            0.24
3           Bagging_20             0.27            0.22
5              rf_max7             0.26            0.23
2     lr_lib_l1_lowerc             0.22            0.21
1          SVC_default             0.06            0.07 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.99
2     lr_lib_l1_lowerc             0.99            0.99
3           Bagging_20             0.99            0.98
5              rf_max7             0.99            0.99
1          SVC_default             0.97            0.97
4  Ada_lr_15_halflearn             0.89            0.90 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.69            0.68
5              rf_max7             0.69            0.75
3           Bagging_20             0.68            0.53
2     lr_lib_l1_lowerc             0.67            0.64
4  Ada_lr_15_halflearn             0.25            0.21
1          SVC_default             0.18            0.18 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.92            0.93
2     lr_lib_l1_lowerc             0.92            0.92
3           Bagging_20             0.92            0.92
4  Ada_lr_15_halflearn             0.92            0.92
5              rf_max7             0.92            0.92
1          SVC_default             0.90            0.91 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.39            0.31
0                 DTC1             0.38            0.36
5              rf_max7             0.37            0.36
2     lr_lib_l1_lowerc             0.34            0.31
4  Ada_lr_15_halflearn             0.27            0.23
1          SVC_default             0.09            0.10 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.96
2     lr_lib_l1_lowerc             0.95            0.95
3           Bagging_20             0.95            0.95
5              rf_max7             0.95            0.96
1          SVC_default             0.93            0.94
4  Ada_lr_15_halflearn             0.91            0.91 

Sampler:  BorderlineSMOTE(random_state=42) 

y value counts of resampled train set
 No     5214
Yes    5214
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.92            0.90
0                 DTC1             0.91            0.92
2     lr_lib_l1_lowerc             0.91            0.91
5              rf_max7             0.90            0.91
4  Ada_lr_15_halflearn             0.67            0.67
1          SVC_default             0.60            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.61            0.54
4  Ada_lr_15_halflearn             0.58            0.50
5              rf_max7             0.29            0.26
3           Bagging_20             0.28            0.21
0                 DTC1             0.27            0.24
2     lr_lib_l1_lowerc             0.22            0.22 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.99
3           Bagging_20             0.99            0.98
2     lr_lib_l1_lowerc             0.98            0.98
5              rf_max7             0.97            0.98
4  Ada_lr_15_halflearn             0.69            0.69
1          SVC_default             0.60            0.62 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.71            0.48
0                 DTC1             0.69            0.68
2     lr_lib_l1_lowerc             0.62            0.60
5              rf_max7             0.54            0.56
4  Ada_lr_15_halflearn             0.17            0.14
1          SVC_default             0.15            0.13 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.93            0.93
4  Ada_lr_15_halflearn             0.93            0.93
0                 DTC1             0.92            0.93
2     lr_lib_l1_lowerc             0.92            0.92
3           Bagging_20             0.92            0.92
5              rf_max7             0.92            0.93 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.40            0.29
0                 DTC1             0.38            0.36
5              rf_max7             0.38            0.35
2     lr_lib_l1_lowerc             0.33            0.32
4  Ada_lr_15_halflearn             0.26            0.22
1          SVC_default             0.24            0.21 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.96
2     lr_lib_l1_lowerc             0.95            0.95
3           Bagging_20             0.95            0.95
5              rf_max7             0.95            0.95
4  Ada_lr_15_halflearn             0.79            0.79
1          SVC_default             0.73            0.74 

In [71]:
# Evaluate oversampling techniques on df2
print('For df2:\n')
fun_multi_clf_sampler_all_tasks(osampler_list,osampler_name_list,clf_list4,clf_name_list4,X_train2.values,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
For df2:

Sampler:  RandomOverSampler(random_state=42) 

y value counts of resampled train set
 No     5231
Yes    5231
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.96            0.96
5              rf_max7             0.89            0.90
2     lr_lib_l1_lowerc             0.76            0.77
0                 DTC1             0.73            0.75
1          SVC_default             0.67            0.64
4  Ada_lr_15_halflearn             0.63            0.63 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.68            0.65
1          SVC_default             0.67            0.59
0                 DTC1             0.62            0.63
2     lr_lib_l1_lowerc             0.60            0.65
5              rf_max7             0.49            0.41
3           Bagging_20             0.27            0.20 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.99            0.99
5              rf_max7             0.90            0.92
2     lr_lib_l1_lowerc             0.77            0.78
0                 DTC1             0.73            0.75
1          SVC_default             0.67            0.65
4  Ada_lr_15_halflearn             0.62            0.62 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.49            0.44
5              rf_max7             0.15            0.18
2     lr_lib_l1_lowerc             0.09            0.11
0                 DTC1             0.08            0.10
1          SVC_default             0.07            0.07
4  Ada_lr_15_halflearn             0.06            0.07 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
1          SVC_default             0.98            0.97
2     lr_lib_l1_lowerc             0.98            0.98
4  Ada_lr_15_halflearn             0.98            0.98
5              rf_max7             0.98            0.97
3           Bagging_20             0.97            0.97 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.35            0.28
5              rf_max7             0.23            0.25
2     lr_lib_l1_lowerc             0.15            0.19
0                 DTC1             0.14            0.17
1          SVC_default             0.12            0.12
4  Ada_lr_15_halflearn             0.11            0.13 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.98            0.98
5              rf_max7             0.94            0.95
2     lr_lib_l1_lowerc             0.86            0.87
0                 DTC1             0.84            0.85
1          SVC_default             0.80            0.78
4  Ada_lr_15_halflearn             0.76            0.76 

Sampler:  ADASYN(random_state=42) 

y value counts of resampled train set
 No     5231
Yes    5205
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.96            0.96
3           Bagging_20             0.96            0.96
5              rf_max7             0.96            0.96
0                 DTC1             0.93            0.93
4  Ada_lr_15_halflearn             0.74            0.74
1          SVC_default             0.63            0.62 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.67            0.60
4  Ada_lr_15_halflearn             0.63            0.48
0                 DTC1             0.37            0.29
5              rf_max7             0.29            0.20
3           Bagging_20             0.22            0.21
2     lr_lib_l1_lowerc             0.06            0.05 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.99            1.00
3           Bagging_20             0.99            0.99
5              rf_max7             0.99            0.99
0                 DTC1             0.95            0.96
4  Ada_lr_15_halflearn             0.74            0.75
1          SVC_default             0.63            0.62 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.41            0.43
3           Bagging_20             0.36            0.50
2     lr_lib_l1_lowerc             0.25            0.36
0                 DTC1             0.20            0.23
4  Ada_lr_15_halflearn             0.08            0.08
1          SVC_default             0.06            0.06 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.97
1          SVC_default             0.98            0.97
4  Ada_lr_15_halflearn             0.98            0.97
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.97
5              rf_max7             0.97            0.97 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.34            0.27
3           Bagging_20             0.27            0.30
0                 DTC1             0.26            0.26
4  Ada_lr_15_halflearn             0.15            0.13
1          SVC_default             0.11            0.12
2     lr_lib_l1_lowerc             0.10            0.09 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
5              rf_max7             0.98            0.98
0                 DTC1             0.96            0.96
4  Ada_lr_15_halflearn             0.85            0.84
1          SVC_default             0.76            0.76 

Sampler:  SMOTE(random_state=42) 

y value counts of resampled train set
 No     5231
Yes    5231
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.96            0.96
2     lr_lib_l1_lowerc             0.96            0.96
3           Bagging_20             0.96            0.96
5              rf_max7             0.96            0.96
4  Ada_lr_15_halflearn             0.74            0.73
1          SVC_default             0.63            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.68            0.57
4  Ada_lr_15_halflearn             0.63            0.48
5              rf_max7             0.35            0.25
0                 DTC1             0.29            0.27
3           Bagging_20             0.24            0.24
2     lr_lib_l1_lowerc             0.08            0.05 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.99
2     lr_lib_l1_lowerc             0.99            0.99
3           Bagging_20             0.99            0.99
5              rf_max7             0.98            0.99
4  Ada_lr_15_halflearn             0.74            0.74
1          SVC_default             0.62            0.61 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.42            0.57
5              rf_max7             0.42            0.44
3           Bagging_20             0.38            0.46
2     lr_lib_l1_lowerc             0.31            0.31
4  Ada_lr_15_halflearn             0.08            0.07
1          SVC_default             0.06            0.06 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.98            0.97
4  Ada_lr_15_halflearn             0.98            0.97
5              rf_max7             0.98            0.97
0                 DTC1             0.97            0.97
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.97 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.38            0.32
0                 DTC1             0.34            0.36
3           Bagging_20             0.29            0.32
4  Ada_lr_15_halflearn             0.14            0.13
2     lr_lib_l1_lowerc             0.13            0.09
1          SVC_default             0.11            0.11 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
5              rf_max7             0.98            0.98
4  Ada_lr_15_halflearn             0.84            0.84
1          SVC_default             0.76            0.75 

Sampler:  SVMSMOTE(random_state=42) 

y value counts of resampled train set
 No     5231
Yes    2960
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.97            0.96
2     lr_lib_l1_lowerc             0.96            0.96
5              rf_max7             0.96            0.96
0                 DTC1             0.95            0.95
1          SVC_default             0.85            0.84
4  Ada_lr_15_halflearn             0.85            0.85 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.40            0.32
4  Ada_lr_15_halflearn             0.40            0.36
0                 DTC1             0.29            0.28
3           Bagging_20             0.22            0.19
5              rf_max7             0.21            0.12
2     lr_lib_l1_lowerc             0.08            0.05 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.99            0.99
3           Bagging_20             0.99            1.00
5              rf_max7             0.99            0.99
0                 DTC1             0.98            0.98
1          SVC_default             0.87            0.86
4  Ada_lr_15_halflearn             0.87            0.87 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.56            0.67
5              rf_max7             0.46            0.38
2     lr_lib_l1_lowerc             0.33            0.31
0                 DTC1             0.32            0.41
1          SVC_default             0.10            0.09
4  Ada_lr_15_halflearn             0.10            0.11 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.98            0.97
4  Ada_lr_15_halflearn             0.98            0.97
0                 DTC1             0.97            0.97
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.97
5              rf_max7             0.97            0.96 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.32            0.29
0                 DTC1             0.30            0.33
5              rf_max7             0.29            0.18
4  Ada_lr_15_halflearn             0.16            0.17
1          SVC_default             0.15            0.14
2     lr_lib_l1_lowerc             0.13            0.09 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
5              rf_max7             0.98            0.98
1          SVC_default             0.92            0.91
4  Ada_lr_15_halflearn             0.92            0.92 

Sampler:  BorderlineSMOTE(random_state=42) 

y value counts of resampled train set
 No     5231
Yes    5231
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.96            0.96
2     lr_lib_l1_lowerc             0.96            0.96
3           Bagging_20             0.96            0.96
5              rf_max7             0.94            0.93
4  Ada_lr_15_halflearn             0.72            0.70
1          SVC_default             0.60            0.59 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.70            0.51
1          SVC_default             0.68            0.61
0                 DTC1             0.29            0.28
5              rf_max7             0.27            0.19
3           Bagging_20             0.22            0.16
2     lr_lib_l1_lowerc             0.11            0.07 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.99            0.99
3           Bagging_20             0.99            0.99
0                 DTC1             0.98            0.99
5              rf_max7             0.96            0.96
4  Ada_lr_15_halflearn             0.72            0.71
1          SVC_default             0.60            0.59 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.45            0.55
0                 DTC1             0.33            0.47
2     lr_lib_l1_lowerc             0.33            0.31
5              rf_max7             0.20            0.17
4  Ada_lr_15_halflearn             0.08            0.07
1          SVC_default             0.06            0.06 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.99            0.97
1          SVC_default             0.98            0.97
0                 DTC1             0.97            0.97
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.96
5              rf_max7             0.97            0.96 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.31            0.35
3           Bagging_20             0.30            0.25
5              rf_max7             0.23            0.18
2     lr_lib_l1_lowerc             0.17            0.11
4  Ada_lr_15_halflearn             0.15            0.12
1          SVC_default             0.11            0.11 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
5              rf_max7             0.97            0.96
4  Ada_lr_15_halflearn             0.83            0.82
1          SVC_default             0.75            0.74 

Evaluating undersampling techniques on the three datasets

In [72]:
# Evaluate undersampling techniques on original df
print('For df:\n')
fun_multi_clf_sampler_all_tasks(usampler_list,usampler_name_list,clf_list4,clf_name_list4,X_train.values,y_train,X_val,y_val,X_test,y_test,'data_source')
For df:

Sampler:  RandomUnderSampler(random_state=42) 

y value counts of resampled train set
 No     787
Yes    787
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.72            0.72
5              rf_max7             0.72            0.71
3           Bagging_20             0.70            0.70
0                 DTC1             0.64            0.64
1          SVC_default             0.61            0.62
4  Ada_lr_15_halflearn             0.61            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.66            0.62
4  Ada_lr_15_halflearn             0.65            0.59
5              rf_max7             0.63            0.58
2     lr_lib_l1_lowerc             0.60            0.54
3           Bagging_20             0.59            0.56
1          SVC_default             0.58            0.56 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.74            0.75
5              rf_max7             0.73            0.73
3           Bagging_20             0.72            0.72
0                 DTC1             0.64            0.64
1          SVC_default             0.62            0.63
4  Ada_lr_15_halflearn             0.60            0.61 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.26            0.24
2     lr_lib_l1_lowerc             0.25            0.24
3           Bagging_20             0.23            0.22
0                 DTC1             0.21            0.20
4  Ada_lr_15_halflearn             0.19            0.18
1          SVC_default             0.18            0.18 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.93            0.92
2     lr_lib_l1_lowerc             0.93            0.92
5              rf_max7             0.93            0.92
3           Bagging_20             0.92            0.92
4  Ada_lr_15_halflearn             0.92            0.91
1          SVC_default             0.91            0.91 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.36            0.33
5              rf_max7             0.36            0.34
3           Bagging_20             0.34            0.32
0                 DTC1             0.32            0.30
4  Ada_lr_15_halflearn             0.30            0.27
1          SVC_default             0.28            0.27 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.82            0.82
5              rf_max7             0.82            0.82
3           Bagging_20             0.81            0.81
0                 DTC1             0.76            0.76
1          SVC_default             0.73            0.74
4  Ada_lr_15_halflearn             0.73            0.73 

Sampler:  NearMiss() 

y value counts of resampled train set
 No     787
Yes    787
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.75            0.74
1          SVC_default             0.66            0.68
5              rf_max7             0.62            0.62
2     lr_lib_l1_lowerc             0.61            0.63
3           Bagging_20             0.52            0.55
4  Ada_lr_15_halflearn             0.43            0.41 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.62            0.56
3           Bagging_20             0.61            0.61
2     lr_lib_l1_lowerc             0.59            0.57
5              rf_max7             0.57            0.56
0                 DTC1             0.46            0.41
1          SVC_default             0.32            0.30 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.79            0.78
1          SVC_default             0.71            0.73
5              rf_max7             0.63            0.63
2     lr_lib_l1_lowerc             0.62            0.64
3           Bagging_20             0.50            0.54
4  Ada_lr_15_halflearn             0.40            0.39 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.24            0.21
2     lr_lib_l1_lowerc             0.18            0.18
5              rf_max7             0.18            0.18
3           Bagging_20             0.15            0.16
1          SVC_default             0.14            0.14
4  Ada_lr_15_halflearn             0.13            0.12 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.91            0.90
2     lr_lib_l1_lowerc             0.91            0.91
5              rf_max7             0.91            0.91
3           Bagging_20             0.90            0.91
1          SVC_default             0.88            0.88
4  Ada_lr_15_halflearn             0.88            0.86 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.32            0.28
2     lr_lib_l1_lowerc             0.28            0.28
5              rf_max7             0.28            0.27
3           Bagging_20             0.25            0.25
4  Ada_lr_15_halflearn             0.22            0.19
1          SVC_default             0.19            0.19 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.84            0.84
1          SVC_default             0.79            0.80
2     lr_lib_l1_lowerc             0.74            0.75
5              rf_max7             0.74            0.75
3           Bagging_20             0.65            0.68
4  Ada_lr_15_halflearn             0.55            0.53 

Sampler:  NearMiss(version=2) 

y value counts of resampled train set
 No     787
Yes    787
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.52            0.52
4  Ada_lr_15_halflearn             0.50            0.49
1          SVC_default             0.49            0.49
5              rf_max7             0.39            0.38
0                 DTC1             0.26            0.26
3           Bagging_20             0.26            0.25 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.89            0.87
3           Bagging_20             0.89            0.88
5              rf_max7             0.84            0.84
2     lr_lib_l1_lowerc             0.75            0.74
4  Ada_lr_15_halflearn             0.73            0.73
1          SVC_default             0.64            0.66 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.49            0.49
4  Ada_lr_15_halflearn             0.47            0.46
1          SVC_default             0.46            0.47
5              rf_max7             0.32            0.31
0                 DTC1             0.17            0.17
3           Bagging_20             0.16            0.16 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.18            0.17
4  Ada_lr_15_halflearn             0.17            0.16
1          SVC_default             0.15            0.15
5              rf_max7             0.15            0.15
0                 DTC1             0.13            0.13
3           Bagging_20             0.13            0.13 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.93            0.93
5              rf_max7             0.93            0.93
4  Ada_lr_15_halflearn             0.92            0.92
0                 DTC1             0.91            0.90
3           Bagging_20             0.91            0.90
1          SVC_default             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.29            0.28
4  Ada_lr_15_halflearn             0.27            0.27
5              rf_max7             0.26            0.25
1          SVC_default             0.24            0.25
0                 DTC1             0.23            0.23
3           Bagging_20             0.23            0.23 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.64            0.64
4  Ada_lr_15_halflearn             0.62            0.61
1          SVC_default             0.61            0.61
5              rf_max7             0.48            0.46
0                 DTC1             0.28            0.28
3           Bagging_20             0.28            0.27 

Sampler:  NearMiss(version=3) 

y value counts of resampled train set
 No     787
Yes    787
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.79            0.78
5              rf_max7             0.69            0.68
3           Bagging_20             0.64            0.64
4  Ada_lr_15_halflearn             0.64            0.63
1          SVC_default             0.50            0.48
0                 DTC1             0.43            0.43 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.77            0.81
1          SVC_default             0.71            0.68
5              rf_max7             0.61            0.57
3           Bagging_20             0.60            0.63
4  Ada_lr_15_halflearn             0.56            0.58
2     lr_lib_l1_lowerc             0.53            0.50 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.83            0.82
5              rf_max7             0.70            0.70
3           Bagging_20             0.65            0.64
4  Ada_lr_15_halflearn             0.65            0.64
1          SVC_default             0.46            0.45
0                 DTC1             0.38            0.37 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.31            0.29
5              rf_max7             0.23            0.21
3           Bagging_20             0.20            0.20
4  Ada_lr_15_halflearn             0.19            0.19
1          SVC_default             0.16            0.15
0                 DTC1             0.15            0.16 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.92            0.93
1          SVC_default             0.92            0.91
2     lr_lib_l1_lowerc             0.92            0.92
3           Bagging_20             0.92            0.92
5              rf_max7             0.92            0.92
4  Ada_lr_15_halflearn             0.91            0.91 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.39            0.36
5              rf_max7             0.34            0.31
3           Bagging_20             0.30            0.31
4  Ada_lr_15_halflearn             0.29            0.28
0                 DTC1             0.26            0.26
1          SVC_default             0.26            0.25 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.87            0.87
5              rf_max7             0.80            0.79
3           Bagging_20             0.76            0.76
4  Ada_lr_15_halflearn             0.76            0.75
1          SVC_default             0.62            0.60
0                 DTC1             0.53            0.53 

Sampler:  EditedNearestNeighbours() 

y value counts of resampled train set
 No     3531
Yes     787
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.90            0.88
5              rf_max7             0.90            0.88
0                 DTC1             0.89            0.88
3           Bagging_20             0.88            0.86
1          SVC_default             0.87            0.87
4  Ada_lr_15_halflearn             0.87            0.87 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.34            0.28
2     lr_lib_l1_lowerc             0.30            0.23
5              rf_max7             0.29            0.23
0                 DTC1             0.28            0.27
4  Ada_lr_15_halflearn             0.02            0.01
1          SVC_default             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00            1.00
4  Ada_lr_15_halflearn             1.00            1.00
5              rf_max7             0.99            0.98
0                 DTC1             0.98            0.97
2     lr_lib_l1_lowerc             0.98            0.97
3           Bagging_20             0.96            0.95 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.77            0.59
2     lr_lib_l1_lowerc             0.73            0.53
0                 DTC1             0.63            0.54
3           Bagging_20             0.53            0.42
4  Ada_lr_15_halflearn             0.36            0.33
1          SVC_default             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.91            0.90
3           Bagging_20             0.91            0.90
0                 DTC1             0.90            0.90
5              rf_max7             0.90            0.90
1          SVC_default             0.87            0.87
4  Ada_lr_15_halflearn             0.87            0.87 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.43            0.32
3           Bagging_20             0.42            0.34
5              rf_max7             0.42            0.34
0                 DTC1             0.39            0.36
4  Ada_lr_15_halflearn             0.03            0.02
1          SVC_default             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.94            0.93
2     lr_lib_l1_lowerc             0.94            0.93
5              rf_max7             0.94            0.94
1          SVC_default             0.93            0.93
3           Bagging_20             0.93            0.92
4  Ada_lr_15_halflearn             0.93            0.93 

Sampler:  RepeatedEditedNearestNeighbours() 

y value counts of resampled train set
 No     2694
Yes     787
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.87            0.86
1          SVC_default             0.87            0.87
2     lr_lib_l1_lowerc             0.87            0.85
5              rf_max7             0.87            0.85
4  Ada_lr_15_halflearn             0.85            0.84
3           Bagging_20             0.84            0.81 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.46            0.37
2     lr_lib_l1_lowerc             0.38            0.34
5              rf_max7             0.37            0.35
0                 DTC1             0.31            0.29
4  Ada_lr_15_halflearn             0.21            0.16
1          SVC_default             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00            1.00
0                 DTC1             0.96            0.95
5              rf_max7             0.95            0.93
2     lr_lib_l1_lowerc             0.94            0.92
4  Ada_lr_15_halflearn             0.94            0.93
3           Bagging_20             0.89            0.87 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.51            0.44
5              rf_max7             0.51            0.41
2     lr_lib_l1_lowerc             0.48            0.39
3           Bagging_20             0.38            0.30
4  Ada_lr_15_halflearn             0.34            0.25
1          SVC_default             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.92            0.91
2     lr_lib_l1_lowerc             0.91            0.91
5              rf_max7             0.91            0.91
0                 DTC1             0.90            0.90
4  Ada_lr_15_halflearn             0.89            0.88
1          SVC_default             0.87            0.87 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.43            0.38
2     lr_lib_l1_lowerc             0.42            0.36
3           Bagging_20             0.42            0.33
0                 DTC1             0.39            0.35
4  Ada_lr_15_halflearn             0.26            0.20
1          SVC_default             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.93            0.92
1          SVC_default             0.93            0.93
2     lr_lib_l1_lowerc             0.93            0.91
5              rf_max7             0.93            0.92
4  Ada_lr_15_halflearn             0.91            0.91
3           Bagging_20             0.90            0.89 

Sampler:  AllKNN() 

y value counts of resampled train set
 No     3157
Yes     787
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.89            0.87
5              rf_max7             0.89            0.88
0                 DTC1             0.88            0.87
1          SVC_default             0.87            0.87
4  Ada_lr_15_halflearn             0.87            0.87
3           Bagging_20             0.85            0.84 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.37            0.31
0                 DTC1             0.34            0.27
2     lr_lib_l1_lowerc             0.32            0.25
5              rf_max7             0.32            0.28
4  Ada_lr_15_halflearn             0.05            0.04
1          SVC_default             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00            1.00
4  Ada_lr_15_halflearn             0.99            0.99
0                 DTC1             0.97            0.96
2     lr_lib_l1_lowerc             0.97            0.96
5              rf_max7             0.97            0.96
3           Bagging_20             0.92            0.92 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.61            0.45
5              rf_max7             0.60            0.51
0                 DTC1             0.59            0.49
3           Bagging_20             0.41            0.35
4  Ada_lr_15_halflearn             0.39            0.34
1          SVC_default             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.91            0.90
2     lr_lib_l1_lowerc             0.91            0.90
3           Bagging_20             0.91            0.90
5              rf_max7             0.91            0.90
4  Ada_lr_15_halflearn             0.88            0.88
1          SVC_default             0.87            0.87 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.43            0.35
2     lr_lib_l1_lowerc             0.42            0.32
5              rf_max7             0.42            0.36
3           Bagging_20             0.39            0.33
4  Ada_lr_15_halflearn             0.10            0.07
1          SVC_default             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.94            0.93
2     lr_lib_l1_lowerc             0.94            0.93
5              rf_max7             0.94            0.93
1          SVC_default             0.93            0.93
4  Ada_lr_15_halflearn             0.93            0.93
3           Bagging_20             0.92            0.91 

In [73]:
# Evaluate undersampling techniques on df1
print('For df1:\n')
fun_multi_clf_sampler_all_tasks(usampler_list,usampler_name_list,clf_list4,clf_name_list4,X_train1.values,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
For df1:

Sampler:  RandomUnderSampler(random_state=42) 

y value counts of resampled train set
 No     592
Yes    592
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.70            0.71
3           Bagging_20             0.68            0.67
0                 DTC1             0.66            0.65
5              rf_max7             0.65            0.66
4  Ada_lr_15_halflearn             0.60            0.58
1          SVC_default             0.58            0.58 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.69            0.67
0                 DTC1             0.68            0.64
4  Ada_lr_15_halflearn             0.67            0.63
2     lr_lib_l1_lowerc             0.64            0.59
1          SVC_default             0.62            0.59
3           Bagging_20             0.59            0.58 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.71            0.72
3           Bagging_20             0.70            0.68
0                 DTC1             0.66            0.65
5              rf_max7             0.65            0.66
4  Ada_lr_15_halflearn             0.59            0.58
1          SVC_default             0.57            0.58 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.20            0.18
0                 DTC1             0.18            0.16
3           Bagging_20             0.18            0.16
5              rf_max7             0.18            0.17
4  Ada_lr_15_halflearn             0.15            0.14
1          SVC_default             0.14            0.13 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.94
2     lr_lib_l1_lowerc             0.95            0.94
5              rf_max7             0.95            0.95
3           Bagging_20             0.94            0.94
4  Ada_lr_15_halflearn             0.94            0.94
1          SVC_default             0.93            0.93 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.30            0.28
0                 DTC1             0.29            0.26
5              rf_max7             0.29            0.27
3           Bagging_20             0.27            0.25
4  Ada_lr_15_halflearn             0.25            0.22
1          SVC_default             0.23            0.21 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.81            0.82
3           Bagging_20             0.80            0.79
0                 DTC1             0.78            0.77
5              rf_max7             0.77            0.78
4  Ada_lr_15_halflearn             0.72            0.72
1          SVC_default             0.71            0.72 

Sampler:  NearMiss() 

y value counts of resampled train set
 No     592
Yes    592
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.74            0.74
0                 DTC1             0.71            0.71
2     lr_lib_l1_lowerc             0.61            0.64
5              rf_max7             0.61            0.61
3           Bagging_20             0.50            0.50
4  Ada_lr_15_halflearn             0.42            0.44 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.62            0.70
2     lr_lib_l1_lowerc             0.60            0.62
4  Ada_lr_15_halflearn             0.60            0.65
5              rf_max7             0.60            0.61
0                 DTC1             0.55            0.49
1          SVC_default             0.21            0.28 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.80            0.79
0                 DTC1             0.72            0.73
5              rf_max7             0.62            0.61
2     lr_lib_l1_lowerc             0.61            0.64
3           Bagging_20             0.49            0.48
4  Ada_lr_15_halflearn             0.40            0.42 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.18            0.16
2     lr_lib_l1_lowerc             0.15            0.15
5              rf_max7             0.15            0.14
3           Bagging_20             0.12            0.12
1          SVC_default             0.11            0.12
4  Ada_lr_15_halflearn             0.10            0.10 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.93            0.93
2     lr_lib_l1_lowerc             0.93            0.94
5              rf_max7             0.93            0.94
3           Bagging_20             0.92            0.94
1          SVC_default             0.90            0.91
4  Ada_lr_15_halflearn             0.90            0.92 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.27            0.24
2     lr_lib_l1_lowerc             0.24            0.25
5              rf_max7             0.24            0.23
3           Bagging_20             0.20            0.21
4  Ada_lr_15_halflearn             0.17            0.18
1          SVC_default             0.14            0.17 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.85            0.85
0                 DTC1             0.82            0.82
2     lr_lib_l1_lowerc             0.74            0.76
5              rf_max7             0.74            0.74
3           Bagging_20             0.64            0.63
4  Ada_lr_15_halflearn             0.56            0.58 

Sampler:  NearMiss(version=2) 

y value counts of resampled train set
 No     592
Yes    592
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.48            0.49
1          SVC_default             0.46            0.47
4  Ada_lr_15_halflearn             0.46            0.47
5              rf_max7             0.36            0.37
0                 DTC1             0.21            0.21
3           Bagging_20             0.21            0.21 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.89            0.91
0                 DTC1             0.87            0.90
5              rf_max7             0.85            0.86
2     lr_lib_l1_lowerc             0.76            0.79
4  Ada_lr_15_halflearn             0.76            0.73
1          SVC_default             0.64            0.66 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.45            0.46
1          SVC_default             0.44            0.46
4  Ada_lr_15_halflearn             0.42            0.44
5              rf_max7             0.31            0.32
0                 DTC1             0.14            0.14
3           Bagging_20             0.13            0.13 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.14            0.13
4  Ada_lr_15_halflearn             0.13            0.12
5              rf_max7             0.12            0.12
1          SVC_default             0.11            0.11
0                 DTC1             0.10            0.10
3           Bagging_20             0.10            0.10 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.95            0.96
2     lr_lib_l1_lowerc             0.94            0.95
4  Ada_lr_15_halflearn             0.94            0.94
1          SVC_default             0.91            0.93
3           Bagging_20             0.91            0.93
0                 DTC1             0.90            0.93 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.23            0.23
4  Ada_lr_15_halflearn             0.22            0.21
5              rf_max7             0.21            0.21
1          SVC_default             0.19            0.19
0                 DTC1             0.18            0.18
3           Bagging_20             0.18            0.18 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.61            0.62
1          SVC_default             0.59            0.61
4  Ada_lr_15_halflearn             0.58            0.60
5              rf_max7             0.47            0.48
0                 DTC1             0.24            0.24
3           Bagging_20             0.23            0.23 

Sampler:  NearMiss(version=3) 

y value counts of resampled train set
 No     592
Yes    592
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.78            0.77
4  Ada_lr_15_halflearn             0.63            0.63
3           Bagging_20             0.62            0.60
5              rf_max7             0.59            0.60
0                 DTC1             0.43            0.44
1          SVC_default             0.37            0.38 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.81            0.73
0                 DTC1             0.80            0.74
5              rf_max7             0.74            0.64
3           Bagging_20             0.67            0.58
2     lr_lib_l1_lowerc             0.56            0.52
4  Ada_lr_15_halflearn             0.56            0.56 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.80            0.80
4  Ada_lr_15_halflearn             0.64            0.64
3           Bagging_20             0.62            0.61
5              rf_max7             0.58            0.59
0                 DTC1             0.39            0.41
1          SVC_default             0.32            0.34 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.24            0.21
3           Bagging_20             0.16            0.13
5              rf_max7             0.16            0.14
4  Ada_lr_15_halflearn             0.15            0.14
0                 DTC1             0.13            0.12
1          SVC_default             0.12            0.10 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.95            0.94
0                 DTC1             0.94            0.94
1          SVC_default             0.94            0.92
2     lr_lib_l1_lowerc             0.94            0.94
3           Bagging_20             0.94            0.93
4  Ada_lr_15_halflearn             0.93            0.93 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.33            0.30
5              rf_max7             0.27            0.23
3           Bagging_20             0.26            0.22
4  Ada_lr_15_halflearn             0.23            0.22
0                 DTC1             0.22            0.20
1          SVC_default             0.21            0.18 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.87            0.87
3           Bagging_20             0.75            0.73
4  Ada_lr_15_halflearn             0.75            0.76
5              rf_max7             0.72            0.73
0                 DTC1             0.56            0.57
1          SVC_default             0.47            0.49 

Sampler:  EditedNearestNeighbours() 

y value counts of resampled train set
 No     3882
Yes     592
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.92            0.92
0                 DTC1             0.91            0.91
2     lr_lib_l1_lowerc             0.91            0.92
3           Bagging_20             0.91            0.90
1          SVC_default             0.90            0.90
4  Ada_lr_15_halflearn             0.90            0.90 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.31            0.23
0                 DTC1             0.29            0.22
2     lr_lib_l1_lowerc             0.26            0.23
5              rf_max7             0.25            0.22
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00            1.00
4  Ada_lr_15_halflearn             1.00            1.00
2     lr_lib_l1_lowerc             0.99            0.99
5              rf_max7             0.99            0.99
0                 DTC1             0.98            0.98
3           Bagging_20             0.98            0.97 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.78            0.74
2     lr_lib_l1_lowerc             0.70            0.70
0                 DTC1             0.62            0.54
3           Bagging_20             0.60            0.48
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.93            0.92
0                 DTC1             0.92            0.92
2     lr_lib_l1_lowerc             0.92            0.92
5              rf_max7             0.92            0.92
1          SVC_default             0.90            0.90
4  Ada_lr_15_halflearn             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.41            0.31
0                 DTC1             0.39            0.32
5              rf_max7             0.38            0.34
2     lr_lib_l1_lowerc             0.37            0.35
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.96            0.96
0                 DTC1             0.95            0.95
1          SVC_default             0.95            0.95
2     lr_lib_l1_lowerc             0.95            0.96
3           Bagging_20             0.95            0.95
4  Ada_lr_15_halflearn             0.95            0.95 

Sampler:  RepeatedEditedNearestNeighbours() 

y value counts of resampled train set
 No     3184
Yes     592
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.92            0.91
2     lr_lib_l1_lowerc             0.91            0.91
1          SVC_default             0.90            0.90
3           Bagging_20             0.90            0.89
4  Ada_lr_15_halflearn             0.90            0.90
0                 DTC1             0.89            0.90 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.39            0.31
0                 DTC1             0.36            0.29
5              rf_max7             0.29            0.23
2     lr_lib_l1_lowerc             0.27            0.23
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00            1.00
4  Ada_lr_15_halflearn             1.00            1.00
5              rf_max7             0.99            0.99
2     lr_lib_l1_lowerc             0.98            0.99
3           Bagging_20             0.95            0.95
0                 DTC1             0.94            0.96 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.72            0.65
2     lr_lib_l1_lowerc             0.66            0.64
3           Bagging_20             0.49            0.41
0                 DTC1             0.42            0.43
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.93            0.93
3           Bagging_20             0.93            0.93
2     lr_lib_l1_lowerc             0.92            0.92
5              rf_max7             0.92            0.92
1          SVC_default             0.90            0.90
4  Ada_lr_15_halflearn             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.43            0.35
5              rf_max7             0.41            0.34
0                 DTC1             0.39            0.35
2     lr_lib_l1_lowerc             0.38            0.34
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.95            0.95
2     lr_lib_l1_lowerc             0.95            0.95
4  Ada_lr_15_halflearn             0.95            0.95
5              rf_max7             0.95            0.95
0                 DTC1             0.94            0.94
3           Bagging_20             0.94            0.94 

Sampler:  AllKNN() 

y value counts of resampled train set
 No     3601
Yes     592
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.92            0.92
5              rf_max7             0.92            0.92
2     lr_lib_l1_lowerc             0.91            0.92
1          SVC_default             0.90            0.90
3           Bagging_20             0.90            0.90
4  Ada_lr_15_halflearn             0.90            0.90 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.34            0.27
0                 DTC1             0.26            0.23
2     lr_lib_l1_lowerc             0.26            0.23
5              rf_max7             0.26            0.22
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00            1.00
4  Ada_lr_15_halflearn             1.00            1.00
0                 DTC1             0.99            0.99
2     lr_lib_l1_lowerc             0.99            0.99
5              rf_max7             0.99            0.99
3           Bagging_20             0.96            0.97 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.78            0.78
0                 DTC1             0.76            0.72
2     lr_lib_l1_lowerc             0.70            0.69
3           Bagging_20             0.52            0.46
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.93            0.93
0                 DTC1             0.92            0.92
2     lr_lib_l1_lowerc             0.92            0.92
5              rf_max7             0.92            0.92
1          SVC_default             0.90            0.90
4  Ada_lr_15_halflearn             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.41            0.34
0                 DTC1             0.39            0.35
5              rf_max7             0.38            0.34
2     lr_lib_l1_lowerc             0.37            0.35
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.96            0.96
5              rf_max7             0.96            0.96
1          SVC_default             0.95            0.95
2     lr_lib_l1_lowerc             0.95            0.96
3           Bagging_20             0.95            0.95
4  Ada_lr_15_halflearn             0.95            0.95 

In [74]:
# Evaluate undersampling techniques on df2
print('For df2:\n')
fun_multi_clf_sampler_all_tasks(usampler_list,usampler_name_list,clf_list4,clf_name_list4,X_train2.values,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
For df2:

Sampler:  RandomUnderSampler(random_state=42) 

y value counts of resampled train set
 No     185
Yes    185
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.76            0.79
2     lr_lib_l1_lowerc             0.74            0.75
0                 DTC1             0.72            0.73
3           Bagging_20             0.72            0.73
4  Ada_lr_15_halflearn             0.63            0.61
1          SVC_default             0.48            0.46 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.75            0.65
4  Ada_lr_15_halflearn             0.68            0.56
0                 DTC1             0.65            0.67
5              rf_max7             0.65            0.57
2     lr_lib_l1_lowerc             0.60            0.67
3           Bagging_20             0.57            0.56 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.76            0.80
2     lr_lib_l1_lowerc             0.75            0.75
0                 DTC1             0.72            0.74
3           Bagging_20             0.72            0.74
4  Ada_lr_15_halflearn             0.63            0.62
1          SVC_default             0.47            0.45 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.09            0.11
0                 DTC1             0.08            0.10
2     lr_lib_l1_lowerc             0.08            0.11
3           Bagging_20             0.07            0.09
4  Ada_lr_15_halflearn             0.06            0.06
1          SVC_default             0.05            0.05 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
1          SVC_default             0.98            0.97
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.97
4  Ada_lr_15_halflearn             0.98            0.97
5              rf_max7             0.98            0.98 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.16            0.19
0                 DTC1             0.14            0.17
2     lr_lib_l1_lowerc             0.14            0.18
3           Bagging_20             0.12            0.15
4  Ada_lr_15_halflearn             0.11            0.11
1          SVC_default             0.09            0.09 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.86            0.88
2     lr_lib_l1_lowerc             0.85            0.85
0                 DTC1             0.83            0.84
3           Bagging_20             0.83            0.84
4  Ada_lr_15_halflearn             0.76            0.75
1          SVC_default             0.63            0.62 

Sampler:  NearMiss() 

y value counts of resampled train set
 No     185
Yes    185
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.80            0.80
0                 DTC1             0.47            0.48
2     lr_lib_l1_lowerc             0.45            0.47
5              rf_max7             0.43            0.44
3           Bagging_20             0.32            0.35
4  Ada_lr_15_halflearn             0.32            0.33 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.78            0.72
5              rf_max7             0.73            0.63
0                 DTC1             0.62            0.48
2     lr_lib_l1_lowerc             0.46            0.53
4  Ada_lr_15_halflearn             0.38            0.63
1          SVC_default             0.13            0.19 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.83            0.82
0                 DTC1             0.47            0.48
2     lr_lib_l1_lowerc             0.45            0.47
5              rf_max7             0.42            0.43
4  Ada_lr_15_halflearn             0.32            0.31
3           Bagging_20             0.30            0.33 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.04            0.04
3           Bagging_20             0.04            0.04
5              rf_max7             0.04            0.05
1          SVC_default             0.03            0.04
2     lr_lib_l1_lowerc             0.03            0.04
4  Ada_lr_15_halflearn             0.02            0.04 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.98            0.96
0                 DTC1             0.97            0.96
3           Bagging_20             0.97            0.96
1          SVC_default             0.96            0.96
2     lr_lib_l1_lowerc             0.96            0.96
4  Ada_lr_15_halflearn             0.94            0.95 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.08            0.07
5              rf_max7             0.08            0.08
3           Bagging_20             0.07            0.08
2     lr_lib_l1_lowerc             0.06            0.08
1          SVC_default             0.04            0.07
4  Ada_lr_15_halflearn             0.04            0.07 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.89            0.89
0                 DTC1             0.63            0.64
2     lr_lib_l1_lowerc             0.62            0.63
5              rf_max7             0.59            0.60
4  Ada_lr_15_halflearn             0.48            0.47
3           Bagging_20             0.46            0.49 

Sampler:  NearMiss(version=2) 

y value counts of resampled train set
 No     185
Yes    185
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.59            0.57
1          SVC_default             0.51            0.49
4  Ada_lr_15_halflearn             0.51            0.48
5              rf_max7             0.31            0.32
0                 DTC1             0.10            0.10
3           Bagging_20             0.10            0.10 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.97            0.97
0                 DTC1             0.95            0.97
5              rf_max7             0.89            0.89
4  Ada_lr_15_halflearn             0.78            0.72
2     lr_lib_l1_lowerc             0.76            0.84
1          SVC_default             0.65            0.60 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.59            0.56
1          SVC_default             0.50            0.49
4  Ada_lr_15_halflearn             0.50            0.47
5              rf_max7             0.29            0.29
0                 DTC1             0.07            0.06
3           Bagging_20             0.07            0.06 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.06            0.08
1          SVC_default             0.05            0.05
4  Ada_lr_15_halflearn             0.05            0.06
0                 DTC1             0.04            0.04
3           Bagging_20             0.04            0.04
5              rf_max7             0.04            0.05 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.99            0.99
5              rf_max7             0.99            0.98
0                 DTC1             0.98            0.98
1          SVC_default             0.98            0.97
3           Bagging_20             0.98            0.98
4  Ada_lr_15_halflearn             0.98            0.97 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.12            0.14
4  Ada_lr_15_halflearn             0.10            0.10
1          SVC_default             0.08            0.09
5              rf_max7             0.08            0.10
0                 DTC1             0.07            0.08
3           Bagging_20             0.07            0.08 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.73            0.72
1          SVC_default             0.66            0.65
4  Ada_lr_15_halflearn             0.66            0.63
5              rf_max7             0.45            0.45
0                 DTC1             0.13            0.12
3           Bagging_20             0.13            0.12 

Sampler:  NearMiss(version=3) 

y value counts of resampled train set
 No     185
Yes    185
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.72            0.73
4  Ada_lr_15_halflearn             0.60            0.60
5              rf_max7             0.58            0.60
3           Bagging_20             0.54            0.55
0                 DTC1             0.41            0.41
1          SVC_default             0.18            0.19 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.90            0.81
0                 DTC1             0.84            0.79
4  Ada_lr_15_halflearn             0.73            0.61
3           Bagging_20             0.71            0.69
5              rf_max7             0.67            0.73
2     lr_lib_l1_lowerc             0.65            0.63 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.73            0.74
4  Ada_lr_15_halflearn             0.60            0.60
5              rf_max7             0.58            0.59
3           Bagging_20             0.54            0.55
0                 DTC1             0.40            0.39
1          SVC_default             0.16            0.16 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.08            0.09
4  Ada_lr_15_halflearn             0.06            0.06
0                 DTC1             0.05            0.05
3           Bagging_20             0.05            0.06
5              rf_max7             0.05            0.07
1          SVC_default             0.04            0.04 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.98
1          SVC_default             0.98            0.95
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
4  Ada_lr_15_halflearn             0.98            0.97
5              rf_max7             0.98            0.98 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.14            0.16
4  Ada_lr_15_halflearn             0.11            0.11
3           Bagging_20             0.10            0.11
5              rf_max7             0.10            0.13
0                 DTC1             0.09            0.10
1          SVC_default             0.07            0.08 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.83            0.84
4  Ada_lr_15_halflearn             0.74            0.74
5              rf_max7             0.73            0.74
3           Bagging_20             0.69            0.70
0                 DTC1             0.57            0.56
1          SVC_default             0.27            0.28 

Sampler:  EditedNearestNeighbours() 

y value counts of resampled train set
 No     4730
Yes     185
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.97            0.97
1          SVC_default             0.97            0.96
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.96
4  Ada_lr_15_halflearn             0.97            0.96
5              rf_max7             0.97            0.96 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.25            0.27
3           Bagging_20             0.22            0.15
2     lr_lib_l1_lowerc             0.13            0.07
5              rf_max7             0.03            0.03
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00             1.0
2     lr_lib_l1_lowerc             1.00             1.0
4  Ada_lr_15_halflearn             1.00             1.0
5              rf_max7             1.00             1.0
0                 DTC1             0.99             1.0
3           Bagging_20             0.99             1.0 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             1.00            0.67
2     lr_lib_l1_lowerc             0.57            0.56
0                 DTC1             0.55            0.74
3           Bagging_20             0.54            0.58
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.97            0.97
1          SVC_default             0.97            0.96
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.96
4  Ada_lr_15_halflearn             0.97            0.96
5              rf_max7             0.97            0.96 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.35            0.39
3           Bagging_20             0.31            0.23
2     lr_lib_l1_lowerc             0.21            0.12
5              rf_max7             0.06            0.05
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
1          SVC_default             0.98            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
4  Ada_lr_15_halflearn             0.98            0.98
5              rf_max7             0.98            0.98 

Sampler:  RepeatedEditedNearestNeighbours() 

y value counts of resampled train set
 No     4534
Yes     185
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.97            0.97
1          SVC_default             0.97            0.96
2     lr_lib_l1_lowerc             0.97            0.96
4  Ada_lr_15_halflearn             0.97            0.96
5              rf_max7             0.97            0.96
3           Bagging_20             0.96            0.96 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.29            0.16
0                 DTC1             0.25            0.27
2     lr_lib_l1_lowerc             0.13            0.07
5              rf_max7             0.05            0.04
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00            1.00
2     lr_lib_l1_lowerc             1.00            1.00
4  Ada_lr_15_halflearn             1.00            1.00
5              rf_max7             1.00            1.00
0                 DTC1             0.99            1.00
3           Bagging_20             0.99            0.99 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.75            0.75
2     lr_lib_l1_lowerc             0.57            0.45
0                 DTC1             0.55            0.74
3           Bagging_20             0.49            0.46
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.97            0.97
1          SVC_default             0.97            0.96
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.96
4  Ada_lr_15_halflearn             0.97            0.96
5              rf_max7             0.97            0.96 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.36            0.24
0                 DTC1             0.35            0.39
2     lr_lib_l1_lowerc             0.21            0.12
5              rf_max7             0.09            0.08
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
1          SVC_default             0.98            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
4  Ada_lr_15_halflearn             0.98            0.98
5              rf_max7             0.98            0.98 

Sampler:  AllKNN() 

y value counts of resampled train set
 No     4646
Yes     185
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.97            0.97
1          SVC_default             0.97            0.96
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.96
4  Ada_lr_15_halflearn             0.97            0.96
5              rf_max7             0.97            0.96 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.25            0.27
3           Bagging_20             0.22            0.16
2     lr_lib_l1_lowerc             0.13            0.07
5              rf_max7             0.05            0.05
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             1.00             1.0
2     lr_lib_l1_lowerc             1.00             1.0
4  Ada_lr_15_halflearn             1.00             1.0
5              rf_max7             1.00             1.0
0                 DTC1             0.99             1.0
3           Bagging_20             0.99             1.0 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.75            0.80
2     lr_lib_l1_lowerc             0.57            0.45
0                 DTC1             0.55            0.74
3           Bagging_20             0.50            0.63
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.97            0.97
1          SVC_default             0.97            0.96
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.96
4  Ada_lr_15_halflearn             0.97            0.96
5              rf_max7             0.97            0.96 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.35            0.39
3           Bagging_20             0.31            0.26
2     lr_lib_l1_lowerc             0.21            0.12
5              rf_max7             0.09            0.10
1          SVC_default             0.00            0.00
4  Ada_lr_15_halflearn             0.00            0.00 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
1          SVC_default             0.98            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
4  Ada_lr_15_halflearn             0.98            0.98
5              rf_max7             0.98            0.98 

Evaluating techniques that combine oversampling and undersampling

In [75]:
# Evaluate combined resampling techniques on original df
print('For df:\n')
fun_multi_clf_sampler_all_tasks(csampler_list,csampler_name_list,clf_list4,clf_name_list4,X_train.values,y_train,X_val,y_val,X_test,y_test,'data_source')
For df:

Sampler:  SMOTEENN(random_state=42) 

y value counts of resampled train set
 Yes    4175
No     2670
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.86            0.84
3           Bagging_20             0.83            0.81
5              rf_max7             0.75            0.75
0                 DTC1             0.70            0.70
4  Ada_lr_15_halflearn             0.51            0.51
1          SVC_default             0.37            0.36 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.85            0.83
4  Ada_lr_15_halflearn             0.75            0.75
0                 DTC1             0.59            0.55
5              rf_max7             0.56            0.54
3           Bagging_20             0.39            0.33
2     lr_lib_l1_lowerc             0.38            0.33 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.93            0.92
3           Bagging_20             0.90            0.88
5              rf_max7             0.78            0.78
0                 DTC1             0.72            0.73
4  Ada_lr_15_halflearn             0.47            0.47
1          SVC_default             0.30            0.29 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.43            0.36
3           Bagging_20             0.36            0.29
5              rf_max7             0.28            0.26
0                 DTC1             0.24            0.22
4  Ada_lr_15_halflearn             0.17            0.17
1          SVC_default             0.15            0.14 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.93            0.92
4  Ada_lr_15_halflearn             0.93            0.93
0                 DTC1             0.92            0.92
5              rf_max7             0.92            0.92
2     lr_lib_l1_lowerc             0.91            0.90
3           Bagging_20             0.91            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.40            0.35
3           Bagging_20             0.37            0.31
5              rf_max7             0.37            0.35
0                 DTC1             0.34            0.32
4  Ada_lr_15_halflearn             0.28            0.28
1          SVC_default             0.26            0.25 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.92            0.91
3           Bagging_20             0.90            0.89
5              rf_max7             0.85            0.84
0                 DTC1             0.81            0.81
4  Ada_lr_15_halflearn             0.63            0.63
1          SVC_default             0.46            0.44 

Sampler:  SMOTETomek(random_state=42) 

y value counts of resampled train set
 Yes    5005
No     5005
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.90            0.89
2     lr_lib_l1_lowerc             0.89            0.88
5              rf_max7             0.89            0.88
3           Bagging_20             0.88            0.88
4  Ada_lr_15_halflearn             0.69            0.68
1          SVC_default             0.62            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.58            0.56
4  Ada_lr_15_halflearn             0.57            0.50
5              rf_max7             0.30            0.27
2     lr_lib_l1_lowerc             0.29            0.22
0                 DTC1             0.27            0.22
3           Bagging_20             0.27            0.22 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.98
2     lr_lib_l1_lowerc             0.98            0.98
5              rf_max7             0.98            0.96
3           Bagging_20             0.97            0.97
4  Ada_lr_15_halflearn             0.71            0.70
1          SVC_default             0.62            0.62 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.83            0.65
2     lr_lib_l1_lowerc             0.70            0.58
5              rf_max7             0.65            0.51
3           Bagging_20             0.58            0.52
4  Ada_lr_15_halflearn             0.23            0.20
1          SVC_default             0.19            0.17 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.92            0.91
1          SVC_default             0.91            0.91
5              rf_max7             0.91            0.90
0                 DTC1             0.90            0.90
2     lr_lib_l1_lowerc             0.90            0.90
3           Bagging_20             0.90            0.90 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.41            0.33
2     lr_lib_l1_lowerc             0.41            0.32
5              rf_max7             0.41            0.35
3           Bagging_20             0.36            0.31
4  Ada_lr_15_halflearn             0.32            0.28
1          SVC_default             0.28            0.27 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.94
2     lr_lib_l1_lowerc             0.94            0.94
5              rf_max7             0.94            0.93
3           Bagging_20             0.93            0.93
4  Ada_lr_15_halflearn             0.80            0.79
1          SVC_default             0.74            0.74 

In [76]:
# Evaluate combined resampling techniques on df1
print('For df1:\n')
fun_multi_clf_sampler_all_tasks(csampler_list,csampler_name_list,clf_list4,clf_name_list4,X_train1.values,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
For df1:

Sampler:  SMOTEENN(random_state=42) 

y value counts of resampled train set
 Yes    4305
No     2949
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.90            0.90
3           Bagging_20             0.89            0.88
5              rf_max7             0.83            0.82
0                 DTC1             0.71            0.70
4  Ada_lr_15_halflearn             0.51            0.51
1          SVC_default             0.33            0.34 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.87            0.86
4  Ada_lr_15_halflearn             0.77            0.72
0                 DTC1             0.61            0.55
5              rf_max7             0.49            0.38
3           Bagging_20             0.36            0.30
2     lr_lib_l1_lowerc             0.29            0.24 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.97            0.97
3           Bagging_20             0.95            0.94
5              rf_max7             0.87            0.87
0                 DTC1             0.72            0.71
4  Ada_lr_15_halflearn             0.48            0.49
1          SVC_default             0.27            0.28 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.51            0.43
3           Bagging_20             0.46            0.35
5              rf_max7             0.30            0.23
0                 DTC1             0.20            0.17
4  Ada_lr_15_halflearn             0.14            0.13
1          SVC_default             0.12            0.11 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.95            0.95
4  Ada_lr_15_halflearn             0.95            0.94
0                 DTC1             0.94            0.94
5              rf_max7             0.94            0.93
3           Bagging_20             0.93            0.93
2     lr_lib_l1_lowerc             0.92            0.92 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.41            0.33
2     lr_lib_l1_lowerc             0.37            0.31
5              rf_max7             0.37            0.29
0                 DTC1             0.30            0.26
4  Ada_lr_15_halflearn             0.24            0.22
1          SVC_default             0.21            0.20 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.95            0.94
3           Bagging_20             0.94            0.93
5              rf_max7             0.90            0.90
0                 DTC1             0.82            0.81
4  Ada_lr_15_halflearn             0.64            0.65
1          SVC_default             0.42            0.44 

Sampler:  SMOTETomek(random_state=42) 

y value counts of resampled train set
 No     5026
Yes    5026
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.91            0.92
2     lr_lib_l1_lowerc             0.91            0.91
3           Bagging_20             0.91            0.90
5              rf_max7             0.91            0.91
4  Ada_lr_15_halflearn             0.65            0.65
1          SVC_default             0.59            0.59 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.64            0.55
4  Ada_lr_15_halflearn             0.60            0.51
5              rf_max7             0.29            0.24
3           Bagging_20             0.28            0.23
0                 DTC1             0.27            0.24
2     lr_lib_l1_lowerc             0.23            0.22 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.99
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.97
5              rf_max7             0.98            0.98
4  Ada_lr_15_halflearn             0.66            0.67
1          SVC_default             0.58            0.60 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.69            0.68
2     lr_lib_l1_lowerc             0.62            0.60
3           Bagging_20             0.62            0.48
5              rf_max7             0.61            0.56
4  Ada_lr_15_halflearn             0.17            0.14
1          SVC_default             0.15            0.13 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.94            0.93
4  Ada_lr_15_halflearn             0.94            0.93
0                 DTC1             0.92            0.93
2     lr_lib_l1_lowerc             0.92            0.92
3           Bagging_20             0.92            0.92
5              rf_max7             0.92            0.92 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
3           Bagging_20             0.39            0.32
5              rf_max7             0.39            0.34
0                 DTC1             0.38            0.36
2     lr_lib_l1_lowerc             0.34            0.33
4  Ada_lr_15_halflearn             0.26            0.22
1          SVC_default             0.24            0.20 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.95            0.96
2     lr_lib_l1_lowerc             0.95            0.95
3           Bagging_20             0.95            0.95
5              rf_max7             0.95            0.95
4  Ada_lr_15_halflearn             0.77            0.78
1          SVC_default             0.72            0.73 

In [77]:
# Evaluate combined resampling techniques on df2
print('For df2:\n')
fun_multi_clf_sampler_all_tasks(csampler_list,csampler_name_list,clf_list4,clf_name_list4,X_train2.values,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
For df2:

Sampler:  SMOTEENN(random_state=42) 

y value counts of resampled train set
 Yes    4549
No     3467
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.96            0.95
3           Bagging_20             0.95            0.94
5              rf_max7             0.94            0.94
0                 DTC1             0.69            0.70
4  Ada_lr_15_halflearn             0.54            0.52
1          SVC_default             0.30            0.30 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.86            0.76
4  Ada_lr_15_halflearn             0.78            0.68
0                 DTC1             0.62            0.55
5              rf_max7             0.40            0.31
3           Bagging_20             0.24            0.28
2     lr_lib_l1_lowerc             0.19            0.08 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.98            0.99
3           Bagging_20             0.97            0.97
5              rf_max7             0.96            0.96
0                 DTC1             0.69            0.71
4  Ada_lr_15_halflearn             0.53            0.52
1          SVC_default             0.28            0.28 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.30            0.25
5              rf_max7             0.26            0.26
3           Bagging_20             0.23            0.27
0                 DTC1             0.07            0.08
4  Ada_lr_15_halflearn             0.06            0.06
1          SVC_default             0.04            0.04 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
4  Ada_lr_15_halflearn             0.99            0.97
0                 DTC1             0.98            0.97
1          SVC_default             0.98            0.96
5              rf_max7             0.98            0.97
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.97 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.31            0.28
2     lr_lib_l1_lowerc             0.23            0.12
3           Bagging_20             0.23            0.28
0                 DTC1             0.12            0.13
4  Ada_lr_15_halflearn             0.11            0.11
1          SVC_default             0.08            0.08 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.97            0.97
5              rf_max7             0.97            0.97
0                 DTC1             0.81            0.82
4  Ada_lr_15_halflearn             0.69            0.68
1          SVC_default             0.44            0.43 

Sampler:  SMOTETomek(random_state=42) 

y value counts of resampled train set
 No     5118
Yes    5118
Name: IsBadBuy, dtype: int64 

overall_accuracy from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.96            0.96
2     lr_lib_l1_lowerc             0.96            0.96
3           Bagging_20             0.96            0.96
5              rf_max7             0.96            0.96
4  Ada_lr_15_halflearn             0.71            0.71
1          SVC_default             0.62            0.61 

recall_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.68            0.59
4  Ada_lr_15_halflearn             0.63            0.53
5              rf_max7             0.32            0.21
0                 DTC1             0.29            0.27
3           Bagging_20             0.21            0.19
2     lr_lib_l1_lowerc             0.08            0.05 

recall_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.99            0.99
2     lr_lib_l1_lowerc             0.99            0.99
3           Bagging_20             0.99            0.99
5              rf_max7             0.98            0.99
4  Ada_lr_15_halflearn             0.71            0.72
1          SVC_default             0.62            0.61 

precision_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.43            0.42
0                 DTC1             0.42            0.57
3           Bagging_20             0.39            0.41
2     lr_lib_l1_lowerc             0.26            0.29
4  Ada_lr_15_halflearn             0.07            0.08
1          SVC_default             0.06            0.06 

precision_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
1          SVC_default             0.98            0.97
4  Ada_lr_15_halflearn             0.98            0.97
5              rf_max7             0.98            0.97
0                 DTC1             0.97            0.97
2     lr_lib_l1_lowerc             0.97            0.96
3           Bagging_20             0.97            0.97 

f1_yes from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
5              rf_max7             0.36            0.28
0                 DTC1             0.34            0.36
3           Bagging_20             0.27            0.26
4  Ada_lr_15_halflearn             0.13            0.13
2     lr_lib_l1_lowerc             0.12            0.09
1          SVC_default             0.11            0.11 

f1_no from test (left) and from validation (right)

              clf_name  test_set_result  val_set_result
0                 DTC1             0.98            0.98
2     lr_lib_l1_lowerc             0.98            0.98
3           Bagging_20             0.98            0.98
5              rf_max7             0.98            0.98
4  Ada_lr_15_halflearn             0.83            0.82
1          SVC_default             0.76            0.75 

Balanced ensemble classifiers

Note the data is automatically balanced with these classifiers, so the function will have "None" for the resampler parameter.

In [78]:
# Evaluate balanced ensemble classifiers on original df
print('For df:\n')
fun_split_val_multi_clf_all_tasks('None',be_clf_list, be_clf_name_list,X_train,y_train,X_val,y_val,X_test,y_test,'data_source')
For df:

overall_accuracy from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.79            0.79
1                RUSBoost             0.72            0.72
2  EasyEnsembleClassifier             0.71            0.71 

recall_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
1                RUSBoost             0.57            0.56
2  EasyEnsembleClassifier             0.57            0.58
0        Balanced_Bagging             0.43            0.41 

recall_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.84            0.84
1                RUSBoost             0.75            0.75
2  EasyEnsembleClassifier             0.73            0.73 

precision_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.28            0.27
1                RUSBoost             0.25            0.24
2  EasyEnsembleClassifier             0.24            0.24 

precision_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
1                RUSBoost             0.92            0.92
2  EasyEnsembleClassifier             0.92            0.92
0        Balanced_Bagging             0.91            0.91 

f1_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
1                RUSBoost             0.35            0.34
0        Balanced_Bagging             0.34            0.33
2  EasyEnsembleClassifier             0.34            0.33 

f1_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.87            0.87
1                RUSBoost             0.83            0.83
2  EasyEnsembleClassifier             0.81            0.82 

In [79]:
# Evaluate balanced ensemble classifiers on df1
print('For df1:\n')
fun_split_val_multi_clf_all_tasks('None',be_clf_list, be_clf_name_list,X_train1,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
For df1:

overall_accuracy from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.82            0.81
2  EasyEnsembleClassifier             0.72            0.71
1                RUSBoost             0.62            0.63 

recall_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
1                RUSBoost             0.67            0.61
2  EasyEnsembleClassifier             0.64            0.59
0        Balanced_Bagging             0.51            0.43 

recall_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.86            0.85
2  EasyEnsembleClassifier             0.72            0.72
1                RUSBoost             0.61            0.64 

precision_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.29            0.24
2  EasyEnsembleClassifier             0.21            0.18
1                RUSBoost             0.16            0.15 

precision_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
2  EasyEnsembleClassifier             0.95            0.94
0        Balanced_Bagging             0.94            0.93
1                RUSBoost             0.94            0.94 

f1_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.37            0.31
2  EasyEnsembleClassifier             0.31            0.28
1                RUSBoost             0.26            0.24 

f1_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.90            0.89
2  EasyEnsembleClassifier             0.82            0.82
1                RUSBoost             0.74            0.76 

In [80]:
# Evaluate balanced ensemble classifiers on df2
print('For df2:\n')
fun_split_val_multi_clf_all_tasks('None',be_clf_list, be_clf_name_list,X_train2,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
For df2:

overall_accuracy from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.83            0.84
1                RUSBoost             0.78            0.79
2  EasyEnsembleClassifier             0.68            0.72 

recall_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
2  EasyEnsembleClassifier             0.67            0.64
1                RUSBoost             0.57            0.53
0        Balanced_Bagging             0.44            0.45 

recall_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.85            0.86
1                RUSBoost             0.78            0.81
2  EasyEnsembleClassifier             0.68            0.72 

precision_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.10            0.12
1                RUSBoost             0.09            0.11
2  EasyEnsembleClassifier             0.07            0.09 

precision_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.98            0.97
1                RUSBoost             0.98            0.98
2  EasyEnsembleClassifier             0.98            0.98 

f1_yes from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.16            0.19
1                RUSBoost             0.15            0.18
2  EasyEnsembleClassifier             0.13            0.16 

f1_no from test (left) and from validation (right)

                 clf_name  test_set_result  val_set_result
0        Balanced_Bagging             0.91            0.91
1                RUSBoost             0.87            0.88
2  EasyEnsembleClassifier             0.81            0.83 

Conclusion

When selecting the best combination of sampling method and classifier from above, the key to remember is that we care most about minimizing type II error for the "Yes" class. In other words we want to minimize the number of actual lemons that the model incorrectly classifies as non-lemons. Granted, it is important be mindful of the tradeoffs. The random forest (rf_max) classifier combined with the "NearMiss2" under sampling technique provides reasonable results for recall on the "Yes" class. But the tradeoff for precision should be be noted.

In [81]:
!cp "/content/drive/MyDrive/Colab Notebooks/car-auction-resampling.ipynb" ./
In [82]:
!jupyter nbconvert --to html "car-auction-resampling.ipynb"
[NbConvertApp] Converting notebook car-auction-resampling.ipynb to html
[NbConvertApp] Writing 6711759 bytes to car-auction-resampling.html